说明
爬虫时,有的数据可以直接在网页源代码中找到;
有的则需要抓包获取(AJAX 网页:点击下一页时只对页面局部进行更新,网址不变;真实的数据接口可在浏览器开发者工具的 Network/XHR 面板中找到)。
构造请求头、发起请求、获取数据
# Build request headers, send the GET request, receive the data.
import csv
import json

import requests

# The AJAX endpoint found via packet capture.
# NOTE(review): these params match Douban's movie search_subjects API — confirm.
url = 'https://movie.douban.com/j/search_subjects'
# Browser-like User-Agent so the server does not reject the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
}
# Paging offset: 0, 20, 40, ... — advance by page_limit per page.
page_start = 0
params = {
    "type": "movie",
    "tag": "热门",
    "sort": "recommend",
    "page_limit": "20",
    "page_start": page_start
}
# requests appends params to the URL as the query string.
response = requests.get(url=url, headers=headers, params=params)
JSON 格式数据的解码与解析(bytes → str → dict)
# Parse the JSON response body into a Python object.
# response.json() decodes the bytes (using the charset declared by the
# server) and parses in one step — equivalent to the manual chain
# response.content -> .decode('utf-8') -> json.loads(...).
data = response.json()
读入CSV文件
# Read a CSV file; the `with` block closes the file automatically.
# NOTE(review): encoding='gbk' assumes the file was saved as GBK
# (common on Chinese Windows) — use 'utf-8' if it was written that way.
with open('limit.csv', mode='r', encoding='gbk', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # Each row is a list of column strings; process as needed.
        print(row)
或者
# Alternative: open/close the file manually. Wrap the work in
# try/finally so the handle is released even if processing raises.
f = open('result.csv', mode='r', encoding='gbk', newline='')
try:
    reader = csv.reader(f)
    for row in reader:
        # Each row is a list of column strings; process as needed.
        print(row)
finally:
    f.close()
写入CSV文件
# Write rows to a CSV file; newline='' stops csv from emitting
# blank lines between rows on Windows.
with open('filename', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    # Header row first.
    title = ['title1', 'title2', 'title3', 'title4']
    writer.writerow(title)
    for da in data['result']:
        # x1..x4 stand for the fields extracted from `da`
        # (fixed typo: `x1. x2` -> `x1, x2`).
        row = [x1, x2, x3, x4]
        writer.writerow(row)
或者
# Alternative without `with`: close explicitly; try/finally guarantees
# the handle is released even if a write fails mid-loop.
f = open('result.csv', mode='w', encoding='gbk', newline='')
try:
    writer = csv.writer(f)
    # Header row first.
    title = ['title1', 'title2', 'title3', 'title4']
    writer.writerow(title)
    for da in data['result']:
        # x1..x4 stand for the fields extracted from `da`
        # (fixed typo: `x1. x2` -> `x1, x2`).
        row = [x1, x2, x3, x4]
        writer.writerow(row)
finally:
    f.close()