)
appendChild方法可以创建节点的结构
节点的方法 insertBefore(newChild,refChild)可以在节点的子节点列表中的任意位置插入新的子节点
方法replaceChild(newChild,oldChild)可以将一个子节点替换为另一个子节点
删除节点:首先需要得到要删除的节点的引用,随后再调用 removeChild(childNode)方法。删除后,可调用unlink()方法强制对被删除的节点及它可能连接的子节点进行垃圾回收;unlink()是xml.dom.minidom特有的方法,xml.dom中不可用。
xml.dom.minidom方法:toprettyxml,它常用的两个可选参数:一个是缩进字符串,一个是换行符。如果没有指定参数值,这两个参数分别默认为制表符(\t)和\n。该方法将DOM 打印为包含良好缩进的XML
<?xml version="1.0"?>
<lib:library
xmlns:lib="http://server.domain.tld/NameSpaces/Library">
<lib:book>
<lib:title>Sandman volumn</lib:title>
<lib:author>Neil Gaiman</lib:author>
</lib:book>
<lib:book>
<lib:title>Good omens</lib:title>
<lib:author>Neil Gamain</lib:author>
<lib:author>Terry Pratchett</lib:author>
</lib:book>
<lib:book>
<lib:title>"Repent,harlequin!" said the man</lib:title>
<lib:author>Harlan Ellison</lib:author>
</lib:book>
</lib:library>
将以上的xml去掉命名空间前缀(lib:)后保存在文件里,可以使用以下代码操作
import os
from xml.dom.minidom import parse
import xml.dom.minidom
def printLibrary(library):
    """Print the title and all authors of every book under *library*.

    Args:
        library: a DOM element whose descendants include ``book`` elements,
            each holding one ``title`` and one or more ``author`` elements
            with a text node as first child.
    """
    # Use the argument that was passed in, not a module-level global, so the
    # function works for any library element.
    for book in library.getElementsByTagName("book"):
        print("*******book******")
        title = book.getElementsByTagName("title")[0]
        print("Title:%s" % title.childNodes[0].data)
        for author in book.getElementsByTagName("author"):
            print("author:%s" % author.childNodes[0].data)
# Load the library file into a DOM tree and grab its <library> element.
myDoc = parse(r'E:\pythonscript\ch15\library.xml')
myLibrary = myDoc.getElementsByTagName("library")[0]
# Collect every <book> element currently in the library.
books = myLibrary.getElementsByTagName("book")
# Build a new <book> with a <title> and an <author> child, each wrapping a
# single text node, then attach it at the end of the library.
newBook = myDoc.createElement("book")
for tagName, text in (("title", "Beginning Python"),
                      ("author", "Peter Norton,et al")):
    field = myDoc.createElement(tagName)
    field.appendChild(myDoc.createTextNode(text))
    newBook.appendChild(field)
myLibrary.appendChild(newBook)
print("--------added a new book!")
#printLibrary(myLibrary)
# Remove every book that has an author whose name contains "Ellison".
# getElementsByTagName returns a list built at call time, so removing nodes
# from the tree while looping over that list is safe.
for book in myLibrary.getElementsByTagName("book"):
    for author in book.getElementsByTagName("author"):
        if "Ellison" in author.childNodes[0].data:
            print(author.childNodes[0].data)
            removedBook = myLibrary.removeChild(book)
            # unlink() breaks the internal references of the removed subtree
            # so it can be garbage collected.
            removedBook.unlink()
            # Stop scanning this book: its author nodes were just unlinked
            # (their childNodes are now empty) and the book must not be
            # removed a second time.
            break
print("------------removed a book.")
#printLibrary(myLibrary)
print(myDoc.toprettyxml())
# Write the modified document back to the library file.
# The with-statement guarantees the file is flushed and closed even if
# write() raises; without closing, the data may not reach disk and the file
# stays locked. XML without an encoding declaration is read as UTF-8, so the
# file is written as UTF-8 explicitly rather than in the platform default.
# NOTE(review): pretty-printing a document that was itself parsed from
# pretty-printed XML presumably adds extra blank lines on each round trip,
# because the existing whitespace text nodes are kept -- confirm if this
# script is run repeatedly.
with open(r"E:\pythonscript\ch15\library.xml", "w", encoding="utf-8") as lib:
    lib.write(myDoc.toprettyxml(" "))
使用sax 解析:
#!/usr/bin/python
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
#begin bookHandler
class bookHandler(ContentHandler):
    """SAX handler that prints each book's title and authors.

    Prints a banner line for every ``book`` element, then one
    ``Title: <text>`` line per ``title`` element and one ``Author: <text>``
    line per ``author`` element.
    """

    # True while the parser is inside an <author> / <title> element.
    inAuthor = False
    inTitle = False

    def __init__(self):
        super().__init__()
        # Text chunks collected for the element currently being read.
        self._chunks = []

    def startElement(self, name, attributes):
        """Print the book banner or start collecting title/author text."""
        if name == "book":
            print("********book*********")
        if name == "title":
            self.inTitle = True
            self._chunks = []
        if name == "author":
            self.inAuthor = True
            self._chunks = []

    def endElement(self, name):
        """Print the collected text once the title/author element closes."""
        # The label and the text are printed together here. The original
        # `print("Title:",)` was a Python 2 idiom for suppressing the newline
        # and has no such effect in Python 3.
        if name == "title":
            self.inTitle = False
            print("Title:", "".join(self._chunks))
        if name == "author":
            self.inAuthor = False
            print("Author:", "".join(self._chunks))

    def characters(self, content):
        """Collect character data belonging to a title or author element."""
        # A SAX parser may deliver one run of text in several calls, so the
        # pieces are buffered instead of printed one per line.
        if self.inTitle or self.inAuthor:
            self._chunks.append(content)
#end bookHandler
# Stream library.xml through a new SAX parser, sending its events to a
# fresh bookHandler instance.
book_handler = bookHandler()
parser = make_parser()
parser.setContentHandler(book_handler)
parser.parse("library.xml")
解析器xml.sax使用Handler对象处理解析文档过程中发生的事件。Handler可能是ContentHandler/DTDHandler/EntityResolver/ErrorHandler。一个sax应用程序必须实现符合这些接口的处理程序类,并为解析器设置处理程序
接口ContentHandler包含了被文件事件触发的方法,例如元素和字符数据的开始和结束等。在解析字符数据时,解析器可以选择将结果作为一整块数据返回,或者作为若干小的以空白分隔的数据块返回,所以在处理一块文本的过程中需要反复调用characters方法。
make_parser方法创建一个新的解析器对象并将它返回。
6 lxml 使用cmd.exe pip install lxml 安装lxml
lxml是python利用libxml2 和libxslt库的快速、丰富特性的唯一绑定,并且它通过一个简单的api允许处理HTML /xml
包lxml使用了略作修改的ElementTreeAPI
导入lxml:import lxml
from lxml import etree
元素类:元素是ElementTreeAPI的主要容器对象,提供了xml树功能的核心,它们拥有属性并且包含文本.
元素类遵守标准的xml树层次,因此既能支持父元素也能支持子元素。
>>> import lxml
>>> from lxml import etree
>>> author=etree.Element("Horror") #创建新的元素类author,并赋予一个标签名称:Horror
>>> print(author.tag)
Horror
>>> writer1=etree.SubElement(author,"NeilGaiman")# 一个元素的子元素 创建一个新的子元素 ,它的标签是NeilGaiman,父元素是author
>>> writer2=etree.SubElement(author,"StephenKing")
>>> writer3=etree.SubElement(author,"CliveBarker")
>>> print(etree.tostring(author))
b'<Horror><NeilGaiman/><StephenKing/><CliveBarker/></Horror>'
>>> writer=author[0] #元素类也是列表,可以使用列表函数
>>> print(writer.tag)
NeilGaiman
>>> for writer in author:
print(writer.tag)
NeilGaiman
StephenKing
CliveBarker
元素可以包含属性,描述元素。
>>> author=etree.Element("author",audience="Adult")
>>> print(author.get("audience"))
Adult
get()方法可以从元素中提取数据,set()方法设置属性或者添加属性
>>> author.set("type","fiction")
>>> author.set("bestseller","Yes")
>>> author.set("testpro","protect")
>>> etree.tostring(author)
b'<author audience="Adult" type="fiction" bestseller="Yes" testpro="protect"/>'
还可以向元素中添加文本
>>> html=etree.Element("html")
>>> body=etree.SubElement(html,"body")
>>> h1=etree.SubElement(body,"h1")
>>> h1.text="Introduction"
>>> paragraph=etree.SubElement(body,"p")
>>> paragraph.text="here is some text representing our paragraph"
>>> etree.tostring(html)
b'<html><body><h1>Introduction</h1><p>here is some text representing our paragraph</p></body></html>'
打印元素的文本:
>>> etree.tostring(paragraph,method="text")
b'here is some text representing our paragraph'
lxml解析函数:
fromstring()
>>> sentence="<info>here is a sentence</info>"
>>> info=etree.fromstring(sentence)
>>> print(info.tag)
info
>>> print(info.text)
here is a sentence
XML()
>>> info=etree.XML("<info>here is a sentence</info>")
>>> print(info.tag)
info
>>> print(info.text)
here is a sentence
>>> etree.tostring(info)
b'<info>here is a sentence</info>'
>>> import io
>>> newsentence=io.StringIO("<info>This is another sentence</info>")
>>> somesentence=etree.parse(newsentence)
>>> etree.tostring(somesentence)
b'<info>This is another sentence</info>'
>>> printit=somesentence.getroot()
>>> print(printit.tag)
info
>>> print(printit.text)
This is another sentence
dom解析xml
import xml.dom.minidom
from xml.dom.minidom import parse
# Parse the config file and take its first <config> element.
dom1 = parse(r'E:\pythonscript\ch15\config.xml')
myconfig = dom1.getElementsByTagName("config")[0]
# For each setting, print the text of the first matching element, in the
# order: utility directory, utility, mode.
for tagName in ("utilitydirectory", "utility", "mode"):
    setting = myconfig.getElementsByTagName(tagName)[0]
    print(setting.childNodes[0].data)
#!/usr/bin/python
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
class configHandler(ContentHandler):
    """SAX handler that prints the utility directory, utility and mode.

    For each ``utilitydirectory``, ``utility`` and ``mode`` element a banner
    line is printed when the element opens, followed by the element's
    character data. Text of all other elements is ignored.
    """

    # True while the parser is inside the corresponding element.
    isUtilDir = False
    isUtil = False
    isMode = False

    def startElement(self, name, attributes):
        """Print a banner and raise the flag for a tracked element."""
        if name == "utilitydirectory":
            self.isUtilDir = True
            print("------------utility directory-----")
        if name == "utility":
            self.isUtil = True
            print("--------------utility----------")
        if name == "mode":
            self.isMode = True
            print("---------------mode--------------")

    def endElement(self, name):
        """Lower the flag of the tracked element that just closed."""
        # The flags must be reset on the instance. Assigning to a bare local
        # name would leave them stuck at True, and every later piece of text
        # in the document would be printed.
        if name == "utilitydirectory":
            self.isUtilDir = False
        if name == "utility":
            self.isUtil = False
        if name == "mode":
            self.isMode = False

    def characters(self, content):
        """Print character data that belongs to a tracked element."""
        # A SAX parser may split one run of text across several calls; each
        # chunk received is printed on its own line.
        if self.isUtilDir or self.isUtil or self.isMode:
            print(content)
# Stream the config file through a new SAX parser, sending its events to a
# fresh configHandler instance.
config_handler = configHandler()
parser = make_parser()
parser.setContentHandler(config_handler)
parser.parse(r"E:\pythonscript\ch15\config.xml")