笔记
爬虫步骤
- 给url发送请求
requests.get('http://www.baidu.com/')
- 解析数据
正则模块
import re
re.findall('<a href="video_(.*?)"', response.text, re.S)
- 保存数据
GET请求
访问知乎发现
请求url:
https://www.zhihu.com/explore
请求方式:
GET
请求头:
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
cookie:
- 携带请求头参数访问url
请求头字典 headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36' } 在get请求内,添加user-agent requests.get(url='https://www.zhihu.com/explore', headers=headers)
- params请求参数
from urllib.parse import urlencode 在get方法中添加params参数 requests.get(url, headers=headers, params={"wd": "安徽工程大学"})
- 携带登录cookies破解GitHub登录验证
requests.get(url, headers=headers, cookies=cookies)
爬取豆瓣TOP250电影信息
import requests
import re
# Crawl the Douban Top 250 movie list (10 pages, 25 movies per page) and
# append each movie's title, detail-page URL, and poster URL to move_list.txt.
# Example of a paginated URL: https://movie.douban.com/top250?start=25&filter=
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
    'Cookie': 'll="118184"; bid=RQx3OC4pgRw; __yadk_uid=uyWZlcX7iVPY2RX2iAJHC6UrYpUiCXaj; trc_cookie_storage=taboola%2520global%253Auser-id%3D4c314db0-2270-427c-844a-4d24d7d044ea-tuct3fcd942; _vwo_uuid_v2=D3FA51A56E43574A050E5ABE014A5CEE7|c3ab98e87d1bf12a4db8dd6658651408; __gads=ID=2ada1d4d203efdcb:T=1560499175:S=ALNI_MafBfzYXuFgJMOF-gXD8EJd_tN_Kg; __utmz=30149280.1560523654.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=30149280.584157489.1560523654.1560523654.1560776436.2; __utmc=30149280; __utmb=30149280.1.10.1560776436; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1560776439%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.1432634203.1560523654.1560523654.1560776439.2; __utmb=223695111.0.10.1560776439; __utmc=223695111; __utmz=223695111.1560776439.2.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_id.100001.4cf6=a9fb23d60dbb7dea.1560499117.3.1560776923.1560524003.'
}

# Compile the extraction pattern once, outside the page loop.  Each
# <div class="item"> yields a (detail-page URL, title, poster image URL) tuple.
movie_pattern = re.compile(
    '<div class="item">.*?href="(.*?)">.*?alt="(.*?)" src="(.*?)" class="">',
    re.S,
)

# Pages are addressed by a `start` offset: 0, 25, 50, ..., 225.
for start in range(0, 250, 25):
    # Fetch one page of results.
    response = requests.get(url=f'https://movie.douban.com/top250?start={start}', headers=headers)
    print(response.status_code)
    # print(response.text)

    # Parse out all movie entries on this page.
    movie_list = movie_pattern.findall(response.text)
    print(movie_list)

    # Append ('a' mode) so results from earlier pages are preserved.
    with open('move_list.txt', 'a', encoding='utf-8') as f:
        for line in movie_list:
            f.write(f'电影名:{line[1]} 电影详情页:{line[0]} 电影图:{line[2]}\n')