一.分析
1.url:
http://www.haoduanzi.com/category/?1-1.html
http://www.haoduanzi.com/category/?1-2.html
所以可以设置为 url = 'http://www.haoduanzi.com/category/?1-{}.html',其中 {} 用页码填充
2.编码:utf-8
3.每个段子的标签:div_list = tree.xpath('//div[@class="left"]/ul[@class="list-box"]/li[not(@class)]')
4.标题:title = odiv.xpath('.//h2/text()')[0]
5.每个段子的内容:text = odiv.xpath('.//div[@class="content"]/a/p/text()')
注意:建议从第二页开始爬取,因为第一页的 HTML 有点小瑕疵,有的段子内容不在 p 标签中,用上面的 XPath 会取不到这部分内容
6.赞和踩:good_bad_lt = odiv.xpath('.//div[@class="ping x1"]/a[@class="good" or @class="bad"]/span/text()')
二.代码
import urllib.request
import urllib.parse
from lxml import etree
import time
import json
# Module-level accumulator: parse_content() appends one dict per scraped
# entry, and main() dumps the whole list to the output file.
item_list = []
def handle_request(url, page):
    """Build the HTTP request object for one listing page.

    Args:
        url: URL template containing a single ``{}`` placeholder for the
            page number.
        page: Page number substituted into the template.

    Returns:
        A ``urllib.request.Request`` for the formatted URL, carrying a
        desktop Chrome ``User-Agent`` header.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    }
    # Fill in the page number, e.g. http://www.haoduanzi.com/category/?1-1.html
    url = url.format(page)
    request = urllib.request.Request(url=url, headers=headers)
    return request
def parse_content(content):
    """Extract every entry from one listing page into ``item_list``.

    For each matching ``<li>`` the title, the content paragraphs and the
    up/down vote counters are read via XPath and appended to the
    module-level ``item_list`` as a dict.

    Args:
        content: HTML source of one listing page, as a string.

    Returns:
        None. Results are appended to ``item_list``.
    """
    # Build the element tree used for the XPath queries.
    tree = etree.HTML(content)
    if tree is None:
        # Guard in case the parser produced no document (e.g. empty input);
        # there is nothing to extract then.
        return

    # li[not(@class)] filters out the <li> elements that carry a class
    # attribute, which are not wanted here.
    entries = tree.xpath('//div[@class="left"]/ul[@class="list-box"]/li[not(@class)]')
    for odiv in entries:
        # Title: fall back to an empty string instead of raising IndexError
        # when an entry has no <h2> text, so one odd entry cannot abort the
        # whole crawl.
        title_lt = odiv.xpath('.//h2/text()')
        title = title_lt[0] if title_lt else ''

        # Body: one text node per <p>, joined with newlines.
        text_lt = odiv.xpath('.//div[@class="content"]/a/p/text()')
        text = '\n'.join(text_lt)

        # Up-vote and down-vote counters, tab-separated.
        good_bad_lt = odiv.xpath('.//div[@class="ping x1"]/a[@class="good" or @class="bad"]/span/text()')
        good_bad = '\t'.join(good_bad_lt)

        item = {
            '标题': title,
            '内容': text,
            '赞和踩': good_bad
        }
        # Collect the entry in the shared result list.
        item_list.append(item)
def main():
    """Interactively crawl a range of listing pages and save the results.

    Prompts for a start and an end page number (inclusive), then for each
    page downloads it, parses it with ``parse_content`` and rewrites
    ``haoduanzi.txt`` with everything collected so far.

    Raises:
        ValueError: If either prompt is answered with a non-integer.
        urllib.error.URLError: If a page cannot be fetched.
    """
    start_page = int(input("请输入起始页码:"))
    end_page = int(input("请输入结束页码:"))
    url = 'http://www.haoduanzi.com/category/?1-{}.html'
    for page in range(start_page, end_page + 1):
        print("第%s页开始爬取······" % page)
        request = handle_request(url, page)
        # Use the response as a context manager so the connection is
        # closed even if reading or decoding fails.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode('utf-8')
        # Parse the page; results accumulate in item_list.
        parse_content(content)
        # Pause between pages.
        time.sleep(2)
        # Rewrite the file after every page so that results gathered so far
        # are on disk if a later page fails.
        with open('haoduanzi.txt', 'w', encoding='utf8') as fp:
            fp.write(str(item_list))
        print("第%s页爬取完成······" % page)
    print("爬取结束!!!")
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()