目标:
利用requests库和正则表达式爬取猫眼电影TOP100的电影名称,上映时间,评分,图片等信息。
抓取目标站点:
url = https://maoyan.com/board/4,如下图所示:排名第一的是《活着》,页面中显示了电影的图片,名称,上映时间,主演,上映地区以及评分等信息。从下图可以看出,页面最下面有分页列表,点击不同的页码时,url后面的offset参数会随之变化。这是一个偏移量参数:如果offset=n,则该页显示的电影序号为n+1到n+10。每页显示十部电影,一共十页。
抓取第一页:
我们先定义get_one_page()方法,传入url参数,再通过main()方法调用,获取到网页的源代码。代码如下:
import requests
def get_one_page(url):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to request.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the server does not answer within the timeout.
    """
    # NOTE(review): the Cookie value looks like one captured from a browser
    # session; it will presumably expire and need refreshing -- confirm.
    headers = {
        'Host': 'maoyan.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Cookie': '__mta=145136523.1594209317667.1594747421688.1594778393662.21; uuid_n_v=v1; uuid=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; mojo-uuid=d8c215e33053ca204328716ca1d0bac5; _lxsdk_cuid=1732e47b2bbc8-0187393c9c62d8-4353760-100200-1732e47b2bbc8; _lxsdk=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; _csrf=f956ce77d0b62dbc4eb84120e066306315999c607a7ed99c375e22ae83b95b07; mojo-session-id={"id":"a2fda4353ffaf46ee5e5608176322ae8","time":1594778370113}; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1594209318,1594741880,1594778370; __mta=145136523.1594209317667.1594747421688.1594778374530.21; mojo-trace-id=4; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1594778393; _lxsdk_s=1735032c070-ff-715-12f%7C%7C7',
    }
    # Without a timeout, requests can wait indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None
def main():
    """Fetch the first page of the TOP100 board and print its raw HTML.

    Prints None when get_one_page() reports a failed request.
    """
    # /board/4 is the TOP100 board this article targets; the later
    # scripts in the article request the same path.
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    print(html)


if __name__ == '__main__':
    main()
部分结果:
<dd>
<i class="board-index board-index-3">3</i>
<a href="/films/416" title="盗梦空间" class="image-link" data-act="boarditem-click" data-val="{movieId:416}">
<img src="//s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:5788b470/image/loading_2.e3d934bf.png" alt="" class="poster-default" />
<img data-src="https://p1.meituan.net/movie/d40efe1183f29d5900f5c60be3c8a89d339225.jpg@160w_220h_1e_1c" alt="盗梦空间" class="board-img" />
</a>
<div class="board-item-main">
<div class="board-item-content">
<div class="movie-item-info">
<p class="name"><a href="/films/416" title="盗梦空间" data-act="boarditem-click" data-val="{movieId:416}">盗梦空间</a></p>
<p class="star">
主演:莱昂纳多·迪卡普里奥,渡边谦,约瑟夫·高登-莱维特
</p>
</div>
</div>
</dd>
用正则表达式提取所要信息:
import requests
import re
def get_one_page(url):
    """Request a page with browser-like headers and return its source.

    Args:
        url: Address of the page to request.

    Returns:
        The page text for an HTTP 200 response, otherwise None.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the server does not answer within the timeout.
    """
    # NOTE(review): the Cookie value looks like one captured from a browser
    # session; it will presumably expire and need refreshing -- confirm.
    headers = {
        'Host': 'maoyan.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Cookie': '__mta=145136523.1594209317667.1594747421688.1594778393662.21; uuid_n_v=v1; uuid=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; mojo-uuid=d8c215e33053ca204328716ca1d0bac5; _lxsdk_cuid=1732e47b2bbc8-0187393c9c62d8-4353760-100200-1732e47b2bbc8; _lxsdk=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; _csrf=f956ce77d0b62dbc4eb84120e066306315999c607a7ed99c375e22ae83b95b07; mojo-session-id={"id":"a2fda4353ffaf46ee5e5608176322ae8","time":1594778370113}; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1594209318,1594741880,1594778370; __mta=145136523.1594209317667.1594747421688.1594778374530.21; mojo-trace-id=4; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1594778393; _lxsdk_s=1735032c070-ff-715-12f%7C%7C7',
    }
    # Without a timeout, requests can wait indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract the movie fields from one board page and print them.

    Args:
        html: Page source as returned by get_one_page(), or None.

    Prints a list with one 7-tuple per movie: index, image URL, title,
    raw cast text, raw release text, integer part of the score and
    fractional part of the score. Returns None.
    """
    # Raw strings keep the digit class a regex escape instead of an
    # invalid string escape, which newer Python versions warn about.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">'
        r'(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    # A failed fetch yields None; treat it as an empty page rather than
    # letting the regex engine raise TypeError.
    items = pattern.findall(html or '')
    print(items)
def main():
    """Fetch the first page of the TOP100 board and print the parsed tuples."""
    # /board/4 is the TOP100 board this article targets; the later
    # scripts in the article request the same path.
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    if html is None:
        # get_one_page() signals failure with None; there is nothing to parse.
        print('Failed to fetch ' + url)
        return
    parse_one_page(html)


if __name__ == '__main__':
    main()
部分结果:
[('1', 'https://p0.meituan.net/moviemachine/b2c5c74d33e45745fd3462e44b3698e18336620.jpg@160w_220h_1e_1c', '我和我的祖国', '\n 主演:黄渤,张译,韩昊霖\n ', '上映时间:2019-09-30', '9.', '7'), ('2', 'https://p0.meituan.net/movie/a3d6ca3bdd5b0ddd7016acff9a9f2f2e2805813.jpg@160w_220h_1e_1c', '叶问4:完结篇', '\n 主演:甄子丹,吴樾,吴建豪\n ', '上映时间:2019-12-20', '9.', '4'), ('3', 'https://p1.meituan.net/movie/d40efe1183f29d5900f5c60be3c8a89d339225.jpg@160w_220h_1e_1c', '盗梦空间', '\n 主演:莱昂纳多·迪卡普里奥,渡边谦,约瑟夫·高登-莱维特\n ', '上映时间:2010-09-01', '9.', '2'), ...]
此时提取到的信息是以元组的形式返回的,比较杂乱。我们将每个元组转换为字典,形成格式化数据。修改代码如下:
import requests
import re
def get_one_page(url):
    """Fetch the HTML source of a single page.

    Args:
        url: Address of the page to request.

    Returns:
        The page text when the status code is 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the server does not answer within the timeout.
    """
    # NOTE(review): the Cookie value looks like one captured from a browser
    # session; it will presumably expire and need refreshing -- confirm.
    headers = {
        'Host': 'maoyan.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Cookie': '__mta=145136523.1594209317667.1594747421688.1594778393662.21; uuid_n_v=v1; uuid=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; mojo-uuid=d8c215e33053ca204328716ca1d0bac5; _lxsdk_cuid=1732e47b2bbc8-0187393c9c62d8-4353760-100200-1732e47b2bbc8; _lxsdk=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; _csrf=f956ce77d0b62dbc4eb84120e066306315999c607a7ed99c375e22ae83b95b07; mojo-session-id={"id":"a2fda4353ffaf46ee5e5608176322ae8","time":1594778370113}; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1594209318,1594741880,1594778370; __mta=145136523.1594209317667.1594747421688.1594778374530.21; mojo-trace-id=4; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1594778393; _lxsdk_s=1735032c070-ff-715-12f%7C%7C7',
    }
    # Without a timeout, requests can wait indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Yield one dict per movie found in a board page.

    Args:
        html: Page source as returned by get_one_page(), or None.

    Yields:
        Dicts with the string fields 'index', 'image', 'title', 'actor',
        'time' and 'score'. Nothing is yielded for None or empty input.
    """
    # Raw strings keep the digit class a regex escape instead of an
    # invalid string escape, which newer Python versions warn about.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">'
        r'(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    # A failed fetch yields None; treat it as an empty page rather than
    # letting the regex engine raise TypeError.
    matches = pattern.findall(html or '')
    for index, image, title, star, release, integer, fraction in matches:
        yield {
            'index': index,
            'image': image,
            'title': title.strip(),
            # Drop the 3-character label that prefixes the cast list.
            'actor': star.strip()[3:],
            # Drop the 5-character label that prefixes the release date.
            'time': release.strip()[5:],
            # The page splits the score into an integer part and a fraction.
            'score': integer + fraction,
        }
def main():
    """Fetch the first page of the TOP100 board and print each movie dict."""
    # /board/4 is the TOP100 board this article targets; the later
    # scripts in the article request the same path.
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    if html is None:
        # get_one_page() signals failure with None; there is nothing to parse.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)


if __name__ == '__main__':
    main()
结果:
{'index': '1', 'image': 'https://p0.meituan.net/moviemachine/b2c5c74d33e45745fd3462e44b3698e18336620.jpg@160w_220h_1e_1c', 'title': '我和我的祖国', 'actor': '黄渤,张译,韩昊霖', 'time': '2019-09-30', 'score': '9.7'}
{'index': '2', 'image': 'https://p0.meituan.net/movie/a3d6ca3bdd5b0ddd7016acff9a9f2f2e2805813.jpg@160w_220h_1e_1c', 'title': '叶问4:完结篇', 'actor': '甄子丹,吴樾,吴建豪', 'time': '2019-12-20', 'score': '9.4'}
{'index': '3', 'image': 'https://p1.meituan.net/movie/d40efe1183f29d5900f5c60be3c8a89d339225.jpg@160w_220h_1e_1c', 'title': '盗梦空间', 'actor': '莱昂纳多·迪卡普里奥,渡边谦,约瑟夫·高登-莱维特', 'time': '2010-09-01', 'score': '9.2'}
{'index': '4', 'image': 'https://p0.meituan.net/movie/79182a20224ebe1751b2d8980420cf21149653.jpg@160w_220h_1e_1c', 'title': '捉妖记', 'actor': '白百何,井柏然,姜武', 'time': '2015-07-16', 'score': '9.1'}
{'index': '5', 'image': 'https://p0.meituan.net/moviemachine/36eda496688542263d9a0f02ac728327823369.jpg@160w_220h_1e_1c', 'title': '宠爱', 'actor': '于和伟,吴磊,张子枫', 'time': '2019-12-31', 'score': '8.9'}
{'index': '6', 'image': 'https://p0.meituan.net/movie/db82e48befc1c7e2e98425165b2500d76222872.jpg@160w_220h_1e_1c', 'title': '为国而歌', 'actor': '王雷,古力娜扎,海一天', 'time': '2019-10-18', 'score': '9.0'}
{'index': '7', 'image': 'https://p1.meituan.net/movie/bbece4ae764b9b9661655e54ce0347d6276411.jpg@160w_220h_1e_1c', 'title': '西游·降魔篇', 'actor': '舒淇,文章,黄渤', 'time': '2013-02-10', 'score': '8.4'}
{'index': '8', 'image': 'https://p0.meituan.net/movie/716fd3706814984a384be50e9c6d3058187208.jpg@160w_220h_1e_1c', 'title': '不期而遇', 'actor': '张雨绮,张亮,王少雍', 'time': '2017-05-19', 'score': '7.9'}
{'index': '9', 'image': 'https://p1.meituan.net/movie/9e6e0bc2f5367214b88fbbadb3d85145486469.jpg@160w_220h_1e_1c', 'title': '美容针', 'actor': '闫妮,杜天皓,宋伊人', 'time': '2017-05-20', 'score': '7.9'}
将提取的第一页信息写入文件:
import json
import requests
import re
def get_one_page(url):
    """Return the HTML of the page at ``url``.

    Args:
        url: Address of the page to request.

    Returns:
        The response text for an HTTP 200 answer, otherwise None.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the server does not answer within the timeout.
    """
    # NOTE(review): the Cookie value looks like one captured from a browser
    # session; it will presumably expire and need refreshing -- confirm.
    headers = {
        'Host': 'maoyan.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Cookie': '__mta=145136523.1594209317667.1594747421688.1594778393662.21; uuid_n_v=v1; uuid=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; mojo-uuid=d8c215e33053ca204328716ca1d0bac5; _lxsdk_cuid=1732e47b2bbc8-0187393c9c62d8-4353760-100200-1732e47b2bbc8; _lxsdk=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; _csrf=f956ce77d0b62dbc4eb84120e066306315999c607a7ed99c375e22ae83b95b07; mojo-session-id={"id":"a2fda4353ffaf46ee5e5608176322ae8","time":1594778370113}; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1594209318,1594741880,1594778370; __mta=145136523.1594209317667.1594747421688.1594778374530.21; mojo-trace-id=4; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1594778393; _lxsdk_s=1735032c070-ff-715-12f%7C%7C7',
    }
    # Without a timeout, requests can wait indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Parse a board page into a stream of movie records.

    Args:
        html: Page source as returned by get_one_page(), or None.

    Yields:
        Dicts with the string fields 'index', 'image', 'title', 'actor',
        'time' and 'score'. Nothing is yielded for None or empty input.
    """
    # Raw strings keep the digit class a regex escape instead of an
    # invalid string escape, which newer Python versions warn about.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">'
        r'(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    # A failed fetch yields None; treat it as an empty page rather than
    # letting the regex engine raise TypeError.
    matches = pattern.findall(html or '')
    for index, image, title, star, release, integer, fraction in matches:
        yield {
            'index': index,
            'image': image,
            'title': title.strip(),
            # Drop the 3-character label that prefixes the cast list.
            'actor': star.strip()[3:],
            # Drop the 5-character label that prefixes the release date.
            'time': release.strip()[5:],
            # The page splits the score into an integer part and a fraction.
            'score': integer + fraction,
        }
def write_to_file(content, path='fsults.txt'):
    """Append one record to the results file as a single line of JSON.

    Args:
        content: JSON-serialisable object; main() passes one movie dict.
        path: File to append to. Defaults to the original output file.
    """
    # ensure_ascii=False keeps non-ASCII titles readable in the file.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def main():
    """Scrape the first TOP100 page, printing and saving every movie."""
    url = 'https://maoyan.com/board/4'
    html = get_one_page(url)
    if html is None:
        # get_one_page() signals failure with None; there is nothing to parse.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    main()
结果:
爬取十页信息(非多进程):
import json
import requests
import re
from requests.exceptions import RequestException
def get_one_page(url):
    """Download one page, reporting any failure as None.

    Args:
        url: Address of the page to request.

    Returns:
        The response text for an HTTP 200 answer. None for any other
        status code or when the request raises RequestException
        (including a timeout).
    """
    # NOTE(review): the Cookie value looks like one captured from a browser
    # session; it will presumably expire and need refreshing -- confirm.
    headers = {
        'Host': 'maoyan.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Cookie': '__mta=145136523.1594209317667.1594747421688.1594778393662.21; uuid_n_v=v1; uuid=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; mojo-uuid=d8c215e33053ca204328716ca1d0bac5; _lxsdk_cuid=1732e47b2bbc8-0187393c9c62d8-4353760-100200-1732e47b2bbc8; _lxsdk=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; _csrf=f956ce77d0b62dbc4eb84120e066306315999c607a7ed99c375e22ae83b95b07; mojo-session-id={"id":"a2fda4353ffaf46ee5e5608176322ae8","time":1594778370113}; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1594209318,1594741880,1594778370; __mta=145136523.1594209317667.1594747421688.1594778374530.21; mojo-trace-id=4; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1594778393; _lxsdk_s=1735032c070-ff-715-12f%7C%7C7',
    }
    try:
        # Without a timeout, requests can wait indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Yield a dict of fields for every movie on a board page.

    Args:
        html: Page source as returned by get_one_page(), or None.

    Yields:
        Dicts with the string fields 'index', 'image', 'title', 'actor',
        'time' and 'score'. Nothing is yielded for None or empty input.
    """
    # Raw strings keep the digit class a regex escape instead of an
    # invalid string escape, which newer Python versions warn about.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">'
        r'(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    # get_one_page() returns None on failure; treat it as an empty page
    # rather than letting the regex engine raise TypeError.
    matches = pattern.findall(html or '')
    for index, image, title, star, release, integer, fraction in matches:
        yield {
            'index': index,
            'image': image,
            # Stripped for consistency with the other text fields.
            'title': title.strip(),
            # Drop the 3-character label that prefixes the cast list.
            'actor': star.strip()[3:],
            # Drop the 5-character label that prefixes the release date.
            'time': release.strip()[5:],
            # The page splits the score into an integer part and a fraction.
            'score': integer + fraction,
        }
def write_to_file(content, path='rasults.txt'):
    """Append one record to the results file as a single line of JSON.

    Args:
        content: JSON-serialisable object; main() passes one movie dict.
        path: File to append to. Defaults to the original output file.
    """
    # ensure_ascii=False keeps non-ASCII titles readable in the file.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def main(offset):
    """Scrape one page of the TOP100 board, printing and saving each movie.

    Args:
        offset: Value of the ``offset`` query parameter that selects the
            page; the caller passes 0, 10, ..., 90.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page() signals failure with None; skip this page instead
        # of handing None to the parser.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Walk the ten pages in order: offsets 0, 10, ..., 90.
    for page_offset in range(0, 100, 10):
        main(page_offset)
爬取十页信息(多进程秒抓):
import json
from multiprocessing import Pool#引入一个进程池
import requests
import re
from requests.exceptions import RequestException
def get_one_page(url):
    """Fetch one page and return its HTML, or None on any failure.

    Args:
        url: Address of the page to request.

    Returns:
        The response text for an HTTP 200 answer. None for any other
        status code or when the request raises RequestException
        (including a timeout).
    """
    # NOTE(review): the Cookie value looks like one captured from a browser
    # session; it will presumably expire and need refreshing -- confirm.
    headers = {
        'Host': 'maoyan.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Cookie': '__mta=145136523.1594209317667.1594747421688.1594778393662.21; uuid_n_v=v1; uuid=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; mojo-uuid=d8c215e33053ca204328716ca1d0bac5; _lxsdk_cuid=1732e47b2bbc8-0187393c9c62d8-4353760-100200-1732e47b2bbc8; _lxsdk=E4462AF0C11111EA80CEB10117801C923B7DA80286224238B09241A4AFFAC697; _csrf=f956ce77d0b62dbc4eb84120e066306315999c607a7ed99c375e22ae83b95b07; mojo-session-id={"id":"a2fda4353ffaf46ee5e5608176322ae8","time":1594778370113}; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1594209318,1594741880,1594778370; __mta=145136523.1594209317667.1594747421688.1594778374530.21; mojo-trace-id=4; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1594778393; _lxsdk_s=1735032c070-ff-715-12f%7C%7C7',
    }
    try:
        # Without a timeout a stalled server would block this worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Turn the HTML of one board page into movie dicts.

    Args:
        html: Page source as returned by get_one_page(), or None.

    Yields:
        Dicts with the string fields 'index', 'image', 'title', 'actor',
        'time' and 'score'. Nothing is yielded for None or empty input.
    """
    # Raw strings keep the digit class a regex escape instead of an
    # invalid string escape, which newer Python versions warn about.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">'
        r'(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    # get_one_page() returns None on failure; treat it as an empty page
    # rather than letting the regex engine raise TypeError.
    matches = pattern.findall(html or '')
    for index, image, title, star, release, integer, fraction in matches:
        yield {
            'index': index,
            'image': image,
            # Stripped for consistency with the other text fields.
            'title': title.strip(),
            # Drop the 3-character label that prefixes the cast list.
            'actor': star.strip()[3:],
            # Drop the 5-character label that prefixes the release date.
            'time': release.strip()[5:],
            # The page splits the score into an integer part and a fraction.
            'score': integer + fraction,
        }
def write_to_file(content, path='rasults.txt'):
    """Append one record to the results file as a single line of JSON.

    Args:
        content: JSON-serialisable object; main() passes one movie dict.
        path: File to append to. Defaults to the original output file.
    """
    # ensure_ascii=False keeps non-ASCII titles readable in the file.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def main(offset):
    """Scrape one page of the TOP100 board, printing and saving each movie.

    Args:
        offset: Value of the ``offset`` query parameter that selects the
            page; the caller passes 0, 10, ..., 90.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page() signals failure with None; skip this page instead
        # of handing None to the parser.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # One task per page offset (0, 10, ..., 90). Pages are handled by
    # separate worker processes, so the order of records in the output
    # file is not guaranteed. The with-block shuts the pool down once
    # map() has returned.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
结果: