Homework: scraping the Douban Top 250
'''
Scrape movie info from the Douban TOP250 list.

Page URLs:
    page 1:  https://movie.douban.com/top250?start=0&filter=
    page 2:  https://movie.douban.com/top250?start=25&filter=
    page 3:  https://movie.douban.com/top250?start=50&filter=
    page 4:  https://movie.douban.com/top250?start=75&filter=
    page 10: https://movie.douban.com/top250?start=225&filter=
Request method: GET
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

Regex (extracts detail-page url, image link, movie name, director, cast/release info, rating, number of ratings, synopsis):
<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>
'''
import re

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
base_url = 'https://movie.douban.com/top250?start={}&filter='

for page in range(10):
    # Each page lists 25 movies, so the start offset is page * 25
    url = base_url.format(page * 25)
    print(url)

    # 1. Send a request to the Douban TOP250 page and get the response
    response = requests.get(url, headers=headers)
    # print(response.text)

    # 2. Extract the data with a regex: detail-page url, image link, movie name,
    #    director, release info, rating, number of ratings, synopsis
    movie_content_list = re.findall(
        # regex pattern (a shorter variant without director/synopsis, kept for reference):
        # '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',
        # text to parse
        response.text,
        # match mode: re.S lets . match newlines so the pattern spans the whole page
        re.S)

    for movie_content in movie_content_list:
        # Unpack the captured groups for one movie
        detail_url, movie_jpg, name, daoyan, timer, point, num, desc = movie_content
        data = (f'name: {name}, detail url: {detail_url}, image url: {movie_jpg}, '
                f'director: {daoyan}, release info: {timer}, rating: {point}, '
                f'ratings count: {num}, synopsis: {desc}\n')
        print(data)

        # 3. Save the data: append each movie's info to a file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
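As a variant of the save step in the homework above, the same captured fields could be appended to a CSV file rather than a free-form text line. This is only a sketch assuming movie_content_list holds the 8-tuples produced by the regex above; the save_to_csv helper and the douban.csv file name are illustrative, not part of the original script.

import csv

def save_to_csv(movie_content_list, path='douban.csv'):
    # Assumes each row is the 8-tuple captured by the regex above:
    # (detail_url, image_url, name, director, release info, rating, ratings count, synopsis)
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for row in movie_content_list:
            writer.writerow([field.strip() for field in row])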
Detailed usage of the requests module
# Detailed usage of requests
# Send a request to each pearvideo detail page and get the response data
'''
Scrape videos from pearvideo
Request URL: https://www.pearvideo.com/
Request method: GET
Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
'''
import re  # regex module

import requests
# uuid.uuid4() generates a random unique id, used here to name the downloaded files
import uuid

# The three steps of a crawler

# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response

# 2. Parse the home page and collect the detail-page video IDs
def parse_index(text):
    # re.findall(pattern, text, mode)
    # re.S: . also matches newlines, so the pattern is applied across the whole text
    # .  matches any single character
    # *? repeats it as few times as possible (non-greedy)
    res = re.findall('a href="video_(.*?)"', text, re.S)
    # print(res)

    detail_url_list = []
    for m_id in res:
        # Build the detail-page url from the video id
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        detail_url_list.append(detail_url)
    print(detail_url_list)
    return detail_url_list

# Parse a detail page and extract the video url
def parse_detail(text):
    '''
    (.*?) : capture the content matched inside the parentheses
    .*?   : match it without capturing
    '''
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url

# 3. Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()

if __name__ == '__main__':
    # 1. Request the home page
    index_res = get_page(url='https://www.pearvideo.com/')

    # 2. Parse the home page to get the detail-page urls
    detail_url_list = parse_index(index_res.text)

    # 3. Request each detail page
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)

        # 4. Parse the detail page to get the video url
        movie_url = parse_detail(detail_res.text)
        print(movie_url)

        # 5. Save the video
        save_movie(movie_url)
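For large video files, reading the whole body through response.content holds everything in memory at once. requests can also stream the body in chunks; below is a minimal sketch of an alternative save step, assuming the same movie_url returned by parse_detail above (save_movie_streaming and the 1 MB chunk size are illustrative choices, not from the original notes).

import uuid

import requests

def save_movie_streaming(movie_url):
    # stream=True defers downloading the body until it is iterated over
    response = requests.get(movie_url, stream=True)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        # Write the video in 1 MB chunks instead of holding it all in memory
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)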
# Visit Zhihu Explore
'''
Request URL: https://www.zhihu.com/explore
Request Method: GET
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
'''
#Visit Zhihu without custom request headers
# import requests
# response = requests.get(url='https://www.zhihu.com/explore')
# print(response.status_code)
# print(response.text)
# #Visit Zhihu with a request-headers parameter
#
# import requests
#
# #Request-headers dict
# headers = {
# 'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# }
#
# #Pass the headers to the get request
# response = requests.get(url='https://www.zhihu.com/explore',headers = headers)
# print(response.status_code)
# # print(response.text)
#
# with open('zhihu.html','w',encoding='utf-8')as f:
# f.write(response.text)
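One small check worth adding after either request above: requests exposes raise_for_status(), which turns 4xx/5xx responses into exceptions, making it obvious when a missing User-Agent got the request rejected. A sketch reusing the same url and headers as above:

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
response = requests.get('https://www.zhihu.com/explore', headers=headers)
# Raises requests.exceptions.HTTPError for 4xx/5xx status codes
response.raise_for_status()
print(response.status_code, response.encoding, len(response.text))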
'''
Carrying cookies
Use a logged-in cookie to get past the cnblogs login check
Request url:
Request URL: https://home.cnblogs.com/set/
Request Method: GET
Request headers
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
cookie:_ga=GA1.2.897792622.1560415926; _gid=GA1.2.2097222173.1560415926; __gads=ID=90a50b775a32932c:T=1560415924:S=ALNI_MbW1TkLegg86CkkukfahBA7y3wgnw; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrkt64Ro4vuztDN4I9mCKsOpo66KpTVpoyHTc3VgN3z62nZN14QzEMAPB1O5u7etKJDD6uU9zOhearKCZDYU7iDhDQucYSDobytq6uMDPmc3zFfgfMCs1UB5EdhPnpGGFWrZHCXZbLnLOsrdf98km6FgjfWqfVjKIzxzGq8NZTXmwXie-musLJnFRtnCqc5UsdseCokD6Ea1nMnI57Qa8V-rVLWfrzNhoMAwa6C68qe_I4wyRGRmSORbT2UE6JzNEtgkEftTqU0ZbhpBSw0K7rzwymnI8PYt6CmVq1GpKy6Xiz-cYxkcyUPCD7YUvWTy-E7O9C81fONFS50KTEtJgjHKcioDKlivHKKeKlj62Qso5ITIP9rGbRtE-aajWrkqap5Phifm9T96hnTjWf3heC0ihwOevA4Ywa8EmTky0xYz_6D53Kflvmp3peXI4g-67bo; .CNBlogsCookie=A19E10B88B12CA6DE89A7CDD8BD2178DC8114CDB3ECECB0F5174F7F0031898EEDC0DD2DD35656021DB87E62104F7C96CF6CECE773B09298826E467457E303F9B52A44B14CC964C0BDDEBF2F3578618F45F2CFD20
'''
import requests
#Request url
url = 'https://home.cnblogs.com/set/'
#Request headers
# headers = {
# 'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
#
# #Splice the cookie directly into the request headers
# 'cookie':'_ga=GA1.2.897792622.1560415926; _gid=GA1.2.2097222173.1560415926; __gads=ID=90a50b775a32932c:T=1560415924:S=ALNI_MbW1TkLegg86CkkukfahBA7y3wgnw; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrkt64Ro4vuztDN4I9mCKsOpo66KpTVpoyHTc3VgN3z62nZN14QzEMAPB1O5u7etKJDD6uU9zOhearKCZDYU7iDhDQucYSDobytq6uMDPmc3zFfgfMCs1UB5EdhPnpGGFWrZHCXZbLnLOsrdf98km6FgjfWqfVjKIzxzGq8NZTXmwXie-musLJnFRtnCqc5UsdseCokD6Ea1nMnI57Qa8V-rVLWfrzNhoMAwa6C68qe_I4wyRGRmSORbT2UE6JzNEtgkEftTqU0ZbhpBSw0K7rzwymnI8PYt6CmVq1GpKy6Xiz-cYxkcyUPCD7YUvWTy-E7O9C81fONFS50KTEtJgjHKcioDKlivHKKeKlj62Qso5ITIP9rGbRtE-aajWrkqap5Phifm9T96hnTjWf3heC0ihwOevA4Ywa8EmTky0xYz_6D53Kflvmp3peXI4g-67bo; .CNBlogsCookie=A19E10B88B12CA6DE89A7CDD8BD2178DC8114CDB3ECECB0F5174F7F0031898EEDC0DD2DD35656021DB87E62104F7C96CF6CECE773B09298826E467457E303F9B52A44B14CC964C0BDDEBF2F3578618F45F2CFD20'
# }
# boke_res = requests.get(url, headers=headers)
# Alternative: pass the cookie via requests' cookies parameter instead of the headers.
# (Strictly, cookies should be a dict of individual name/value pairs; the raw cookie
#  string is kept as a single entry here only to mirror the notes above.)
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
# cookies = {
# 'cookie':'_ga=GA1.2.897792622.1560415926; _gid=GA1.2.2097222173.1560415926; __gads=ID=90a50b775a32932c:T=1560415924:S=ALNI_MbW1TkLegg86CkkukfahBA7y3wgnw; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrkt64Ro4vuztDN4I9mCKsOpo66KpTVpoyHTc3VgN3z62nZN14QzEMAPB1O5u7etKJDD6uU9zOhearKCZDYU7iDhDQucYSDobytq6uMDPmc3zFfgfMCs1UB5EdhPnpGGFWrZHCXZbLnLOsrdf98km6FgjfWqfVjKIzxzGq8NZTXmwXie-musLJnFRtnCqc5UsdseCokD6Ea1nMnI57Qa8V-rVLWfrzNhoMAwa6C68qe_I4wyRGRmSORbT2UE6JzNEtgkEftTqU0ZbhpBSw0K7rzwymnI8PYt6CmVq1GpKy6Xiz-cYxkcyUPCD7YUvWTy-E7O9C81fONFS50KTEtJgjHKcioDKlivHKKeKlj62Qso5ITIP9rGbRtE-aajWrkqap5Phifm9T96hnTjWf3heC0ihwOevA4Ywa8EmTky0xYz_6D53Kflvmp3peXI4g-67bo; .CNBlogsCookie=A19E10B88B12CA6DE89A7CDD8BD2178DC8114CDB3ECECB0F5174F7F0031898EEDC0DD2DD35656021DB87E62104F7C96CF6CECE773B09298826E467457E303F9B52A44B14CC964C0BDDEBF2F3578618F45F2CFD20'
# }
# boke_res = requests.get(url, headers=headers, cookies=cookies)
#
# # If the cookie is valid, the logged-in account's email appears on the settings page
# print('1277886541@qq.com' in boke_res.text)
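Instead of splicing the whole cookie header by hand into every request, a requests.Session keeps cookies across requests automatically and can be seeded with values copied from the browser. A rough sketch; the cookie values below are placeholders to fill in, not real ones.

import requests

session = requests.Session()
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
})
# Seed the session with the login cookies copied from the browser (placeholder values)
session.cookies.set('.CNBlogsCookie', '<value copied from the browser>')
session.cookies.set('.Cnblogs.AspNetCore.Cookies', '<value copied from the browser>')

# Every request made through the session now carries those cookies automatically
res = session.get('https://home.cnblogs.com/set/')
print(res.status_code)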