爬虫流程
- 定义url,伪造headers
- 请求数据
- 解析数据
- 提取数据
- 持久化保存
示例
import requests
from lxml import etree
import time
import csv
# 1. Define the target URL and spoof request headers (browser-like
#    User-Agent so the server does not reject the script as a bot).
url = "https://fm.qq.com/category/39087_38979"
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
}
# 2. Fetch the page.
#    - timeout=10: without a timeout, requests can block forever on a
#      dead connection.
#    - raise_for_status(): fail loudly on 4xx/5xx instead of silently
#      parsing an HTML error page downstream.
resp = requests.get(url=url, headers=headers, timeout=10)
resp.raise_for_status()
response = resp.text  # keep the original name: downstream code expects the body text here
# 3. Parse the HTML and select every album <li> node in the listing.
tree = etree.HTML(response)
html_li = tree.xpath('//*[@id="j-album-list"]/li')
with open('糗百.csv', 'w', encoding='gbk', newline='') as f: # 创建文件对象(指定文件名,模式,编码方式)
csv_writer = csv.writer(f) # 基于文件对象构建 csv写入对象
csv_writer.writerow(["标题", &#