#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Scrape one listing page of qiushibaike.com and append each post to qiushi.json.

This is a Python 2 script (it relies on ``urllib2``). Every post found on the
page is written as one JSON object per line, UTF-8 encoded.
"""

from contextlib import closing
import json
import urllib2

from lxml import etree

url = "http://www.qiushibaike.com/8hr/page/2/"
# NOTE(review): the User-Agent value has no closing parenthesis; it is kept
# byte-for-byte because the server may have been tested against this string.
headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}


def _first(values, default=None):
    """Return the first element of *values*, or *default* when it is empty."""
    return values[0] if values else default


def _text_at(elements, index):
    """Return ``elements[index].text``, or None when that element is absent."""
    return elements[index].text if len(elements) > index else None


def _parse_post(node):
    """Extract the fields of a single post from its container element.

    Args:
        node: lxml element matched by the ``qiushi_tag`` container XPath.

    Returns:
        dict with the keys ``username``, ``image``, ``content``, ``zan``
        (likes) and ``comments``. ``image`` is the (possibly empty) list of
        ``src`` attributes found under the ``thumb`` div; every other field
        is None when the corresponding element is missing from the post.
    """
    # Likes and comment counts are the first and second <i> under the post.
    counters = node.xpath('.//i')
    return {
        # XPath always returns a list; take the single match when there is one.
        "username" : _first(node.xpath('./div/a/@title')),
        # Not every post carries a picture, so the list is stored as-is.
        "image" : node.xpath('.//div[@class="thumb"]//@src'),
        "content" : _text_at(node.xpath('.//div[@class="content"]/span'), 0),
        "zan" : _text_at(counters, 0),
        "comments" : _text_at(counters, 1)
    }


request = urllib2.Request(url, headers=headers)

# urllib2 responses are not context managers in Python 2, so closing() is
# what guarantees the connection is released even if read() raises.
with closing(urllib2.urlopen(request)) as response:
    html = response.read()

# The response body is a plain string; parse it into an HTML DOM tree.
text = etree.HTML(html)

# Locate every post container. contains() does a substring match: the first
# argument is the attribute to test, the second the fragment it must contain.
node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')

# Open the output once instead of once per post; each post is one JSON line.
with open("qiushi.json", "a") as f:
    for node in node_list:
        items = _parse_post(node)
        f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")