例子:
from lxml import html
from lxml import etree
import re
dom = html.fromstring(urllib2.urlopen(x[1]).read()) #取出html文本并转为HtmlElement对象
for div in dom.xpath("//div[@class='trip-wps']/div[@class='trip-days']"):
inner_html = etree.tostring(div)
print div.attrib.get('id','day-default'),inner_html
0.etree.tostring(): 将Element对象序列化为字符串
1.Element对象:
lxml.etree.Element(arg1,arg2): 创建一个Element对象,arg1是标签名,arg2是以关键字参数形式给出的属性键值对,如propkey="propvalue"
root = etree.Element("root",prop1 = "abc")
print root.tag,root.get("prop1")
root abc
.set(arg1,arg2): 改变或增加一个属性, (key,value)
.get(arg1,arg2): (key,defaultvalue),获得一个属性的值;属性不存在时返回默认值arg2
.attrib: 属性(不是方法,不加括号),返回一个类似字典的attrib对象,可以用字典的方法来操作标签的所有属性
.text: 标签的值,直接赋值
.tag: 属性(不是方法,不加括号),值为标签名字符串
root = etree.Element("root",prop1 = "abc")
print root.tag,root.get("prop1")
root.set("prop3","345")
print root.get("prop2","no_prop"),root.attrib["prop1"],root.attrib.get("prop3","233")
attribs = root.attrib
attribs["prop2"] = "1234"
print root.get("prop2"," ")
root.text = "TEXT"
print etree.tostring(root,pretty_print=True)
root abc
no_prop abc 345
1234
<root prop1="abc" prop3="345" prop2="1234">TEXT</root>
.append(Element()): 为标签添加一个子节点
#etree.SubElement(Element(),tag): 为指定节点增加一个标签名为tag的子节点,并返回该子节点
.insert(index,Element()): 在第index的位置插入一个子节点
.index(Element()): 返回指定子节点在父节点中的位置(从0开始的下标)
root.append(etree.Element("child1"))
print etree.tostring(root,pretty_print=True)
root.insert(0,etree.Element("child0"))
print type(root)
for item in root:
print item.tag
child2 = etree.SubElement(root,"child2")
child3 = etree.SubElement(child2,"child3")
child4 = etree.SubElement(child2,"child4")
print etree.tostring(root,pretty_print=True)
print len(root),len(child2),len(child3)
print root.index(root[0]),root.index(root[2])
<root prop1="abc" prop3="345" prop2="1234">TEXT<child1/></root>
<type 'lxml.etree._Element'>
child0
child1
<root prop1="abc" prop3="345" prop2="1234">TEXT<child0/><child1/><child2><child3/><child4/></child2></root>
3 2 0
0 2
list(Element()): 返回一个Element的列表,包含所有的子节点
root[i] = root[i-1]: 用第i-1个子节点替换第i个子节点;lxml中节点只能有一个父节点位置,赋值时节点是被移动而不是被复制,因此效果相当于删除原来的第i个子节点
childs = list(root)
print childs
root[1] = root[0]
print etree.tostring(root,pretty_print=True)
[<Element child0 at 0x13fc1b88>, <Element child1 at 0x1405e4c8>, <Element child2 at 0x13da0f88>]
<root prop1="abc" prop3="345" prop2="1234">TEXT<child0/><child2><child3/><child4/></child2></root>
.getparent(): 获得子节点的父节点
.getnext(): 获得下一个(相邻)节点
.getprevious(): 获得上一个(相邻)节点
print child3.getparent().tag
print child3.getnext()
print child3.getprevious()
print child2.getprevious()
child2
<Element child4 at 0x139e7688>
None
<Element child0 at 0x13fc1b88>
.tail: 标签结束之后、下一个标签开始之前的文本,可直接赋值
child2.tail = "TAIL"
print etree.tostring(root,pretty_print=True)
<root prop1="abc" prop3="345" prop2="1234">TEXT<child0/><child2><child3/><child4/></child2>TAIL</root>
.xpath(arg): (查找路径)
html = etree.Element("html")
body = etree.SubElement(html, "body")
body.text = "TEXT"
etree.tostring(html)
br = etree.SubElement(body, "br")
etree.tostring(html)
br.tail = "TAIL"
print etree.tostring(html)
<html><body>TEXT<br/>TAIL</body></html>
print html.xpath("string()")#获得文本
print html.xpath("//text()")#获得标签的值
text_list = etree.XPath("//text()")#封装一个XPath()方法
texts = text_list(html)
print texts
print texts[0].getparent()#可以根据标签的值找到标签
print texts[1].getparent()
print texts[0].is_text#判断这个值是在标签头/末(标签末的值并不属于这个标签)
print texts[1].is_text
print texts[1].is_tail
.iter(arg): (过滤参数),返回Element对象的迭代器:过滤参数可以指定一个节点名,或者一种对象类型
etree.SubElement(child2,"child3").text = "5"
for item in root.iter():
print item.tag,item.text
for item in root.iter("child3"):
print item.tag,item.text
for item in root.iter(tag=etree.Element):
print item.tag,item.text
root TEXT
child0 None
child2 None
child3 None
child4 4
child3 5
child3 None
child3 5
root TEXT
child0 None
child2 None
child3 None
child4 4
child3 5
2.序列化:
etree.tostring()/ElementTree.write(): (Element(),[pretty_print,encoding,method])使用同样的参数,前者返回一个字符串,后者写入一个文件(也可以是类文件对象或URL)
hroot = etree.HTML(etree.tostring(root))
#hroot = etree.XML(etree.tostring(root))
print etree.tostring(hroot,method="html",pretty_print=True)
print etree.tostring(hroot,method="xml",pretty_print=True,encoding="utf-8")
print etree.tostring(hroot,method="text",pretty_print=True)
print root.xpath(".//child3")
print root.find(".//child3")
<html><body><root prop1="abc" prop3="345" prop2="1234">TEXT<child0></child0><child2><child3></child3><child4>4</child4><child3>5</child3></child2>TAIL</root></body></html>
<html>
<body>
<root prop1="abc" prop3="345" prop2="1234">TEXT<child0/><child2><child3/><child4>4</child4><child3>5</child3></child2>TAIL</root>
</body>
</html>
TEXT45TAIL
[<Element child3 at 0x13de3e08>, <Element child3 at 0x13de3e48>]
<Element child3 at 0x13de3e08>
3.ElementTree对象:ElementTree主要是对一棵带根节点的树的文档级包装类。它提供了很多方法来解析、序列化以及进行一般的文档处理。与Element最大的区别是:它作为一个完整的文档来序列化,而Element只序列化成单个元素。
etree.parse(arg): (html文件),解析html文件并创建一个ElementTree对象
etree.fromstring(arg): (字符串),解析字符串并返回根节点Element对象(注意不是ElementTree对象)
例子中的
dom = html.fromstring(urllib2.urlopen(x[1]).read())
也可以写为(注意html.parse返回的是ElementTree对象,需再调用.getroot()才能得到与上面相同的根节点)
dom = html.parse(urllib2.urlopen(x[1]))#lxml.html返回HtmlElement对象,是Element的子类
.getroot(): 返回tree的根节点,即Element root
4.解析器Parser 略。