Crawl Qiushibaike and generate the corresponding CSV file
Code:
import csv
import requests
from lxml import etree
Headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36 Edg/89.0.774.48"
}
Base_url = "https://www.qiushibaike.com"

c = open("test.csv", "w", encoding="utf-8", newline="")
writer = csv.writer(c)
# CSV header row: 题目 = title, 段子内容 = joke content
writer.writerow(['题目', '段子内容'])
qiushis = []
def spider():
    url = "https://www.qiushibaike.com/text/page/{}/"
    # loop over the list pages (only page 1 here; widen the range for more pages)
    for x in range(1, 2):
        page_url = url.format(x)  # format a copy so the template stays reusable on later pages
        detail_urls = get_detail_urls(page_url)
        # loop over the detail pages found on this list page
        for detail_url in detail_urls:
            qiushi = parse_detail(detail_url)
            qiushis.append(qiushi)
            print(qiushi)
    c.close()
# Parse the content of a single detail page
def parse_detail(url):
    qiushi = {}
    resp = requests.get(url, headers=Headers)
    text = resp.content.decode("utf-8")
    html = etree.HTML(text)
    DivE = html.xpath("//div[@class='col1 new-style-col1']")[0]
    # use './/' so the search is scoped to DivE rather than the whole document
    title = DivE.xpath(".//h1[@class='article-title']/text()")[0]
    qiushi["title"] = title.replace('\n\n', '')
    content = DivE.xpath(".//div[@class='content']/text()")[0]
    qiushi["content"] = content
    # write this joke as one CSV row: [title, content]
    writer.writerow([qiushi["title"], qiushi["content"]])
    return qiushi
# Collect the detail-page URLs from a list page
def get_detail_urls(url):
    res = requests.get(url, headers=Headers)
    text = res.text
    html = etree.HTML(text)
    # note: this class string only matches posts tagged 'typs_hot'; other items on the page are skipped
    detail_urls = html.xpath("//div[@class='article block untagged mb15 typs_hot']/a[@class='contentHerf']/@href")
    detail_urls = map(lambda u: Base_url + u, detail_urls)
    return detail_urls
print(__name__)

if __name__ == '__main__':
    spider()
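To sanity-check the output, the generated file can be read back with the standard csv module. A minimal sketch, assuming the script above has already run and written test.csv to the current directory:

import csv

with open("test.csv", encoding="utf-8", newline="") as f:
    for row in csv.reader(f):
        # the first row is the header ['题目', '段子内容']; each later row is [title, content]
        print(row)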
Run result: