import urllib.request
import urllib.parse
from lxml import etree
import os
import ssl
import json
# Swap urllib's default HTTPS context factory for the unverified one, so HTTPS
# requests made through urllib skip certificate validation.
# NOTE(review): this is process-wide and security-sensitive; it relies on
# private `ssl` attributes. Confirm it is still needed for the target site.
ssl._create_default_https_context = ssl._create_unverified_context

# Accumulates one dict per scraped joke; main() appends to it and dumps it.
jock_list = []

# Page to scrape.
url = "http://neihanshequ.com/"

# Request headers: present a desktop Chrome User-Agent string to the server.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/63.0.3239.84 Safari/537.36"
    ),
}
# XPath of the container <div> of every joke on the page.
_JOKE_NODES_XPATH = '//div[@class="content"]/ul[@id="detail-list"]/li/div'

# XPath (relative to one joke container) for each field saved per joke:
# avatar URL, user name, text content, likes, dislikes, favorites, shares and
# comment count. The keys are the JSON keys written to the output file, so
# they are kept exactly as they were (including the 'fovor' spelling).
_FIELD_XPATHS = {
    'image_url': './/div[@class="header "]/a/img/@data-src',
    'user_name': './/div[@class="header "]/a/div/span[@class="name"]/text()',
    'content': './/div[@class="content-wrapper"]/a/div/p/text()',
    'support': './/div[@class="options"]/ul/li[@class="digg-wrapper "]/span[@class="digg"]/text()',
    'tread': './/div[@class="options"]/ul/li[@class="bury-wrapper "]/span[@class="bury"]/text()',
    'fovor': './/div[@class="options"]/ul/li[@class="repin-wrapper "]/span[@class="repin"]/text()',
    'share': './/div[@class="options"]/ul/li[@class="share-wrapper right"]/span[@class="share"]/text()',
    'comment': './/div[@class="options"]/ul/li[@class="post-comment-btn right"]/span[@class="comment J-comment-count"]/text()',
}


def _parse_jokes(html):
    """Return one dict per joke found in *html*.

    Each dict maps every key of ``_FIELD_XPATHS`` to the raw result of
    evaluating that XPath on the joke's container node.
    """
    html_tree = etree.HTML(html)
    return [
        {field: node.xpath(expression) for field, expression in _FIELD_XPATHS.items()}
        for node in html_tree.xpath(_JOKE_NODES_XPATH)
    ]


def main():
    """Download the joke page, extract every joke and save them as JSON.

    Requests the module-level ``url`` with ``headers``, decodes the body as
    UTF-8, prints each parsed joke, appends it to the module-level
    ``jock_list`` and finally writes the whole list to ``jock.text``
    (overwriting any existing file).
    """
    request = urllib.request.Request(url=url, headers=headers)
    # Use the response as a context manager so the connection is always
    # closed, even if reading or decoding fails.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8')

    for jock in _parse_jokes(html):
        print(jock)
        jock_list.append(jock)

    # Save to file; ensure_ascii=False keeps non-ASCII text readable.
    with open('jock.text', 'w', encoding='utf-8') as f1:
        f1.write(json.dumps(jock_list, ensure_ascii=False))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
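# Article title: "Parsing a page with XPath to scrape jokes" (xpath解析页面,爬取段子).
# Web-page residue: "latest recommended article published on 2020-07-13 10:30:14".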