Python training results - Day 4 of Python training

# The full crawler workflow:

# 1. Send a request (request libraries)
# requests
# selenium

# 2. Get the response data (returned by the server)

# 3. Parse and extract the data (parsing libraries)
# re (regular expressions)
# bs4 (BeautifulSoup4)
# XPath

# 4. Save the data (storage)
# MongoDB

# Crawler frameworks
# Scrapy (object-oriented)
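
# A minimal sketch (not from the original notes) tying the four steps together:
# requests for steps 1-2, re for step 3, and a plain file instead of MongoDB for step 4.
# The URL and pattern below are placeholders for illustration only.
import requests
import re

# 1. send the request
response = requests.get('https://www.example.com')
# 2. get the response data returned by the server
html = response.text
# 3. parse and extract data, here pulling the page title with re
titles = re.findall('<title>(.*?)</title>', html, re.S)
# 4. save the data
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(titles))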

# Detailed usage of the requests module

# HTTP protocol:

# Request URL:
#
# Request method:
# GET

# Request headers:
# Cookie: may need attention
# User-Agent: used to prove that the client is a browser

# import requests
# response = requests.get(url)  # url of the page to fetch
# print(response.status_code)
# print(response.text)

# Visit Zhihu with a request-headers parameter
# import requests
# # headers dict for the request
# headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
# # add the user-agent to the GET request
# response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(response.status_code)
# with open('zhihu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)

#

# Sending cookies
# Use login cookies to get past login verification (the notes mention GitHub, but the code below targets cnblogs)
# Request URL: ....
# Request method: ....
# Request headers: ....

import requests

url = 'https://home.cnblogs.com/set/'

# the cookie request header is named Cookie; its value is copied from a logged-in browser session
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
           'Cookie': '_ga=GA1.2.736039140.1560498371; _gid=GA1.2.393939017.1560498371; __gads=ID=22042d69ef7c440a:T=1560498371:S=ALNI_MZjbBvbmYulhYR0hD7DDAvxO0aolQ; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrl5aDQbB7qHF12lN377FcJeizO5Dr4IA_1e7Aq8woZhTxdhKDrbe8NA3gDFqxX5fXn7Op4tblZ3WlqCLIBc9yYqTekcG0jfa9xAH-ur9i-QKr9dvFLlxL1TVSknTiV9iA9nxENBL_WJqnpg8Lo7M5DkfKd0hslNAvuFza9WE3InaBkqJom6ThPvt0z-LN0yviYk5duwVIT8HM1tfOHM2KT_ERkPqKSUTgVRKYGKWrMsG89yDtjKBL1lp0IjzQtzIzK0215tgd3fh0guFL2U994D-ZgHTQthJ0ZZErBUrZ3Z2aHMiJnHXVJLWW3NWAlRuk-R4snWbHpJt8diYsfn-P-q79Ms2SmCAKEg8Vqzf41Qb5lYT_qvGWw0vU3uZwglGwb6KycLuTwKVIXYcrrmgR_F5mFa6MnIoylo1ljVhgRROZgBVQz15SMONXFGTpaX8zI; .CNBlogsCookie=A7F62226302E1403835FB5491EFFE521C6FEB4D05375BC64EC3B87D308A75E4372DBFD7E26B197F93A52D7C4212BA3EF74F4A65A51B7CA92266DAA7F0365C3C7FE6BA6294557EF3FB7CAB11990D3E5723D5FEB51; _gat=1'}

# approach 1: send the cookies inside the request headers
github_res = requests.get(url, headers=headers)

# approach 2: pass cookies through the cookies parameter; requests actually expects a dict of
# individual {cookie_name: value} pairs (see the sketch after this block), while the original
# notes keep the whole browser string under a single key
cookies = {'Cookies': '_ga=GA1.2.736039140.1560498371; _gid=GA1.2.393939017.1560498371; __gads=ID=22042d69ef7c440a:T=1560498371:S=ALNI_MZjbBvbmYulhYR0hD7DDAvxO0aolQ; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrl5aDQbB7qHF12lN377FcJeizO5Dr4IA_1e7Aq8woZhTxdhKDrbe8NA3gDFqxX5fXn7Op4tblZ3WlqCLIBc9yYqTekcG0jfa9xAH-ur9i-QKr9dvFLlxL1TVSknTiV9iA9nxENBL_WJqnpg8Lo7M5DkfKd0hslNAvuFza9WE3InaBkqJom6ThPvt0z-LN0yviYk5duwVIT8HM1tfOHM2KT_ERkPqKSUTgVRKYGKWrMsG89yDtjKBL1lp0IjzQtzIzK0215tgd3fh0guFL2U994D-ZgHTQthJ0ZZErBUrZ3Z2aHMiJnHXVJLWW3NWAlRuk-R4snWbHpJt8diYsfn-P-q79Ms2SmCAKEg8Vqzf41Qb5lYT_qvGWw0vU3uZwglGwb6KycLuTwKVIXYcrrmgR_F5mFa6MnIoylo1ljVhgRROZgBVQz15SMONXFGTpaX8zI; .CNBlogsCookie=A7F62226302E1403835FB5491EFFE521C6FEB4D05375BC64EC3B87D308A75E4372DBFD7E26B197F93A52D7C4212BA3EF74F4A65A51B7CA92266DAA7F0365C3C7FE6BA6294557EF3FB7CAB11990D3E5723D5FEB51; _gat=1'}

github_res = requests.get(url, headers=headers, cookies=cookies)

# if the account email shows up in the returned page, the login cookies worked
print('2545976330@qq.com' in github_res.text)
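
# Not in the original notes: requests' cookies= parameter expects a dict of individual
# {cookie_name: value} pairs, so a raw Cookie string copied from the browser can be split
# like this (raw_cookie below is a shortened placeholder):
raw_cookie = '_ga=GA1.2.736039140.1560498371; _gid=GA1.2.393939017.1560498371; _gat=1'
cookies = {}
for pair in raw_cookie.split('; '):
    name, _, value = pair.partition('=')
    cookies[name] = value

github_res = requests.get(url, headers=headers, cookies=cookies)
print('2545976330@qq.com' in github_res.text)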

Multi-threaded crawler

# works under the Python 3.6 interpreter; under 3.7 it raised a shutdown error

import requests
import re
import uuid
from concurrent.futures import ThreadPoolExecutor  # import the thread pool module

pool = ThreadPoolExecutor(50)  # limit the pool to 50 threads

# The three crawler steps
# 1. Send the request
def get_page(url):
    print(f'Starting async task: {url}')
    response = requests.get(url)
    return response

# 2. Parse the data
# Parse the homepage to get the video detail IDs
def parse_index(res):
    response = res.result()
    # the original pattern was truncated in these notes; this is a typical pattern
    # for the video_<id> links on the pearvideo homepage
    id_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
    for m_id in id_list:
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

# Parse the detail page to get the video url
def parse_detail(res):
    response = res.result()
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    pool.submit(get_page, movie_url).add_done_callback(save_movie)

# 3. Save the data
def save_movie(res):
    movie_res = res.result()
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        print(f'Video download finished: {movie_res.url}')
        f.flush()

if __name__ == '__main__':
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)

Scraping some movie information from Douban

'''
Homepage:
https://movie.douban.com/top250
GET
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re regex:
# fields to extract: movie detail page url, poster image link, movie title, rating, number of raters
'''
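
# The notes break off at this point; the sketch below shows how the extraction could look.
# The regex is an assumption based on typical Douban Top250 markup (class names such as
# "item", "title" and "rating_num" are not taken from the notes).
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}

response = requests.get(url, headers=headers)

# one tuple per movie: detail page url, poster link, title, rating, number of raters
pattern = ('<div class="item">.*?<a href="(.*?)">.*?<img.*?src="(.*?)".*?'
           '<span class="title">(.*?)</span>.*?'
           '<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>')
movies = re.findall(pattern, response.text, re.S)

for detail_url, img_url, title, rating, num_raters in movies:
    print(detail_url, img_url, title, rating, num_raters)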
