五十行代码爬取糗事百科段子
使用到的模块
- requests
- lxml
代码(使用类)
import requests
from lxml import etree
class Qiubai:
    """Scrape joke text from qiushibaike.com and append it to a local file.

    Workflow: build the list of known page URLs, fetch each page, extract the
    joke paragraphs with XPath, and append the text to an output file.
    """

    def __init__(self):
        # Browser-like request headers. HTTP header names must not contain
        # spaces — the original "sec - fetch - dest" style is not a valid
        # header name and would be rejected/ignored.
        self.headers = {
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "cross-site",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
        }

    def get_html(self, url):
        """Fetch *url* with the browser-like headers and return the body as text."""
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_url_list(self):
        """Return the URLs of the 13 known listing pages (numbered from 1).

        Pages on the site start at 1, so ``range(1, 14)`` is used —
        ``range(13)`` would request the nonexistent page 0 and skip page 13.
        """
        url_template = "https://www.qiushibaike.com/text/page/{}/"
        return [url_template.format(page) for page in range(1, 14)]

    def run(self):
        """Download every page and append the extracted joke text to the output file."""
        for url in self.get_url_list():
            html = etree.HTML(self.get_html(url))
            # Each joke lives in a <div class="content"><span>...</span></div>.
            content_list = html.xpath('//div[@class="content"]')
            # Append mode so repeated runs accumulate rather than overwrite.
            with open("糗事百科搞笑段子.doc", "a", encoding="utf-8") as f:
                for content in content_list:
                    for text in content.xpath("./span/text()"):
                        # Strip embedded newlines, then separate jokes with blank lines.
                        f.write(text.replace("\n", ""))
                        f.write("\n\n\n")
if __name__ == "__main__":
    # Entry point: build the scraper and crawl all pages when run as a script.
    spider = Qiubai()
    spider.run()
运行结果截图
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/f89a11fcf7150d70f3c8f0167dca41ac.png)
- 构造url列表(因为已经知道有多少页了)
- 发送请求获取响应
- 使用xpath提取数据
- 保存