笔记
请求库:
requests
selenium
解析库:
re正则
bs4(BeautifulSoup4)
Xpath
存储库:
MongoDB
爬多个视频
# Crawler
# Send a request.
def get_page(url, headers=None, timeout=None):
    """Fetch *url* with ``requests.get`` and return the Response object.

    Args:
        url: Target URL to request.
        headers: Optional request-header dict. Defaults to ``None`` so
            existing callers keep the original no-headers behaviour.
        timeout: Optional timeout in seconds. ``None`` preserves the
            original wait-forever behaviour; pass a number to avoid
            hanging on a dead server.

    Returns:
        requests.Response: The raw response (caller reads ``.text`` /
        ``.content``).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response
# Parse the index page.
def parse_index(text):
    """Extract video ids from the index-page HTML and build detail URLs.

    Args:
        text: Raw HTML of the pearvideo home page.

    Returns:
        list[str]: One detail-page URL per ``video_<id>"`` occurrence,
        in document order. Empty list when nothing matches.
    """
    # Non-greedy capture of everything between 'video_' and the next quote.
    video_ids = re.findall(r'video_(.*?)"', text, re.S)
    # Comprehension replaces the original append loop (same order, same URLs).
    return ['http://www.pearvideo.com/video_' + m_id for m_id in video_ids]
# Parse the detail page to get the video url.
def parse_detail(text):
    """Return the first value captured after ``srcUrl"`` in *text*.

    Raises IndexError when the page contains no match (same as the
    original behaviour).
    """
    matches = re.findall(r'srcUrl"(.*?)"', text, re.S)
    return matches[0]
# Save the data.
def save_movie(movie_url):
    """Download the video at *movie_url* and write it to a random .mp4 file.

    The file is named with a fresh ``uuid4`` so repeated runs never
    collide in the working directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # stream=True: iterate the body in chunks instead of buffering a
    # whole video in memory (the original read response.content at once).
    response = requests.get(movie_url, stream=True)
    # Fail loudly instead of silently writing an HTML error page to disk.
    response.raise_for_status()
    # Write the video to the local disk; the with-block closes (and hence
    # flushes) the file, so the original explicit f.flush() is redundant.
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
if __name__ == '__main__':
    # Request the home page.
    index_res = get_page(url='http://www.pearvideo.com/')
    # Parse the home page for detail-page ids.
    # BUG FIX: the original passed `response.text`, but `response` is a
    # local inside get_page and undefined here (NameError at runtime);
    # the home-page response is bound to `index_res`.
    detail_url_list = parse_index(index_res.text)
    # Request every detail-page url.
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        # Parse the detail page for the real video url.
        movie_url = parse_detail(detail_res.text)
        print(movie_url)
        # Save the video to disk.
        save_movie(movie_url)
requests模块详细使用
# 访问知乎发现
# 请求url:
# https://www.zhihu.com/explore
# 请求方式:
# GET
# 请求头:
# user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36
# 携带请求头参数访问知乎:
import requests

# Request-header dict: sending a real browser User-Agent keeps the site
# from rejecting the request as a bot.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
}
# Pass the user-agent headers inside the GET request.
# (This line was bare prose in the original notes, which made the snippet
# invalid Python.)
response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
print(response.status_code)
print(response.text)
# FIX: the original wrote to 'zhuhu.html' (typo); save as 'zhihu.html'.
with open('zhihu.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
params
params参数
url = 'https://www.baidu.com/s?'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
}
# `params` is url-encoded and appended to the query string, i.e. .../s?wd=klay
response = requests.get(url, headers=headers, params={'wd': 'klay'})
with open('curry.html', 'w', encoding='utf-8') as f:
    # FIX: the original notes cut this line off at `f.write(response.t`,
    # a SyntaxError; complete it to write the fetched page text.
    f.write(response.text)
selenium模块
爬虫框架
作业
豆瓣影评top250电影信息
电影名称
电影链接
图片链接
简介
导演信息
简介