lxml实战（python2.7）

最新推荐文章于 2022-07-03 20:46:21 发布

weixin_30633507

最新推荐文章于 2022-07-03 20:46:21 发布

阅读量107

点赞数

文章标签： python json

原文链接：http://www.cnblogs.com/realmonkeykingsun/p/7874081.html

版权

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 import urllib2
 5 import json
 6 from lxml import etree
 7 
 8 url = "http://www.qiushibaike.com/8hr/page/2/"
 9 headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
10 
11 request = urllib2.Request(url, headers = headers)
12 
13 html = urllib2.urlopen(request).read()
14 # 响应返回的是字符串，解析为HTML DOM模式 text = etree.HTML(html)
15 
16 text = etree.HTML(html)
17 # 返回所有段子的结点位置，contains()模糊查询方法，第一个参数是要匹配的标签，第二个参数是标签名部分内容
18 node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')
19 
20 items ={}
21 for node in node_list:
22     # xpath返回的列表，这个列表就这一个参数，用索引方式取出来，用户名
23     username = node.xpath('./div/a/@title')[0]
24     # 图片连接
25     image = node.xpath('.//div[@class="thumb"]//@src')#[0]
26     # 取出标签下的内容,段子内容
27     content = node.xpath('.//div[@class="content"]/span')[0].text
28     # 取出标签里包含的内容，点赞
29     zan = node.xpath('.//i')[0].text
30     # 评论
31     comments = node.xpath('.//i')[1].text
32 
33     items = {
34         "username" : username,
35         "image" : image,
36         "content" : content,
37         "zan" : zan,
38         "comments" : comments
39     }
40 
41     with open("qiushi.json", "a") as f:
42         f.write(json.dumps(items, ensure_ascii = False).encode("utf-8") + "\n")