#猫眼电影TOP100榜单信息爬取
import requests
import re
import time
def get_one_page(url, timeout=10):
    """Fetch one page of the Maoyan TOP100 board.

    Args:
        url: full board URL, e.g. 'https://maoyan.com/board/4?offset=0'.
        timeout: seconds to wait for the server. The original call had no
            timeout, so a stalled connection would hang the script forever;
            the default keeps the old call signature working.

    Returns:
        The page HTML text on HTTP 200, otherwise None.
    """
    # Cookies copied from a real browser session — Maoyan serves a
    # verification page to requests that look too bot-like, so these
    # (together with the browser-like headers below) are required.
    cookies = {
        '__mta': '156120324.1578214071605.1578214262484.1578214360267.11',
        '_lxsdk_cuid': '1656faea28ac8-0e4f93e1e1caa8-5d4e211f-1fa400-1656faea28a86',
        'uuid_n_v': 'v1',
        'uuid': '0E256E502F9811EA949207B5279E2303CAFB77B8C3904641BBCC3FC1A8745521',
        '_csrf': '48e6b9d20b0443fe7e0138e9725776ee69f804dbe595ebcc21931d712decdf8c',
        '_lx_utm': 'utm_source^%^3DBaidu^%^26utm_medium^%^3Dorganic',
        '_lxsdk': '0E256E502F9811EA949207B5279E2303CAFB77B8C3904641BBCC3FC1A8745521',
        'Hm_lvt_703e94591e87be68cc8da0da7cbd0be2': '1578214071',
        'mojo-uuid': '78b1c01903b485c55f08d94e3ff8b27c',
        'mojo-session-id': '^{^\\^id^\\^:^\\^59044f06f7301c844f0548ca120d3396^\\^,^\\^time^\\^:1578214071659^}',
        'Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2': '1578214360',
        'mojo-trace-id': '21',
        '_lxsdk_s': '16f74e39b69-605-656-ac6^%^7C^%^7C24',
    }
    headers = {
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://maoyan.com/board/4?offset=80',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
# 用正则表达式选取需要的信息
# 排名信息
# result_top = re.findall('board-index board-index.*?">(.*?)',html)
# print(result_top)
# 电影图片
# result_img = re.findall('src="(.*?)@.*?board-img' ,html)
# print(result_img)
# # 电影名称
# result_move_nm = re.findall('"name".*?"boarditem-click".*?>(.*?)<', html)
# print(result_move_nm)
# # 主演
# result_star = re.findall('<.*?"star">(.*?)<',html,re.S)
# print(result_star)
# # 上映时间
# result_time = re.findall('"releasetime">(.*?)<',html,re.S)
# print(result_time)
# 评分
# html = get_one_page('https://maoyan.com/board/4?offset=0')
# result_score = re.findall('score.*?integer">(.*?)<.*?fraction">(.*?)<',html,re.S)
# print(result_score)
#解析第一页源码
def parse_one_page(html):
    """Extract movie records from one page of the Maoyan TOP100 board HTML.

    Fields per movie: rank, poster URL, title, stars, release date, score.

    Args:
        html: page source from get_one_page(); may be None/empty when the
            request failed upstream.

    Returns:
        A string with one tab-separated line per movie, each prefixed with
        a newline; '' when html is empty or nothing matches.
    """
    if not html:
        return ''
    # re.S makes '.' match newlines so the pattern can span tag boundaries.
    # Every '.*' is lazy ('.*?') so one match can't swallow several entries.
    # BUGFIX: the old pattern captured the rank as '">(.*?).*?' with no
    # terminator after the group — a lazy group followed by another lazy
    # '.*?' always matches the empty string, so the rank column was always
    # blank. The '<' after the group anchors it to the closing tag.
    pattern = re.compile(
        'board-index board-index.*?">(.*?)<.*?'        # rank digits, e.g. "1"
        'data-src="(.*?)@.*?board-img.*?'              # poster URL (before the @crop suffix)
        '"name".*?"boarditem-click".*?>(.*?)<.*?'      # movie title
        '"star">(.*?)<.*?'                             # stars (padded with whitespace/newlines)
        '"releasetime">(.*?)<.*?'                      # release date text
        'score.*?integer">(.*?)<.*?fraction">(.*?)<',  # score = integer part + fraction part
        re.S)
    items = re.findall(pattern, html)
    all_items = ''
    for rank, img, name, star, releasetime, integer, fraction in items:
        # The star text carries surrounding newlines/spaces in the page
        # source, so it is stripped before output.
        message = '\t'.join([rank, img, name, star.strip(),
                             releasetime, integer + fraction])
        all_items += '\n' + message
    return all_items
def write_txt(text):
    """Append one page of parsed results to the output file.

    Opened in append mode so successive pages accumulate; UTF-8 encoding
    keeps the Chinese titles intact on any platform.
    """
    with open('榜单.txt', 'a', encoding='utf-8') as out_file:
        out_file.write(text)
# Crawl the board pages. Each page holds 10 movies and is addressed by
# offset=0,10,20,... (the full TOP100 would be range(0, 100, 10)).
url_prefix = 'https://maoyan.com/board/4?offset='
for offset in range(0, 20, 10):
    url = url_prefix + str(offset)
    html_source = get_one_page(url)
    if html_source is None:
        # Non-200 response (e.g. the anti-bot verification page) —
        # skip this page instead of crashing in parse_one_page.
        print('failed to fetch ' + url)
        continue
    content = parse_one_page(html_source)
    print(content)
    write_txt(content)
    # Pause between requests to stay polite and avoid rate limiting.
    time.sleep(3)
以上是在王老师的帮助下完成的人生第一爬,特此感谢王老师不厌其烦的教导。本案例中主要使用正则表达式来匹配信息,目的是熟悉它的用法;光写匹配信息那里我就来来回回修改了五六遍,多次尝试才使他变得稍微短了些。
另外我认为函数的概念对于非常轴、很轴的我而言还是有难度的。其次需要注意的地方就是网页的链接。榜单1-10页的链接是有所区别且有规律的，offset=0,10,20… 是需要额外设置的。
该文章主要用于学习使用，欢迎各位大佬指导