import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import Pool
import os
from hashlib import md5
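
# Crawl the Maoyan Top 100 film board (http://maoyan.com/board/4): fetch each
# page of ten films, parse the fields with a regex, append the records to a
# text file as JSON lines, and download the poster images. The ten pages are
# crawled in parallel by a process pool.
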
def get_one_page(url):
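    """Fetch one page of the board and return its HTML text, or None on a non-200 response or a network error."""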
user_agent = "Mozilla"
headers = {"User-Agent": user_agent} # 将爬虫隐藏为Mac用户(固定格式)
try:
        response = requests.get(url, headers=headers, timeout=10)  # bound the wait so a stalled request cannot hang
if response.status_code == 200:
return response.text
return None
except RequestException:
        return None

def parse_one_page(html):
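    """Pull each film's fields out of the board HTML with one regex over the <dd> blocks and yield them as dicts."""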
pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name.*?title="(.*?)".*?star">(.*?)'
'</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
items = re.findall(pattern, html)
for item in items:
yield {
            '排名': item[0],              # rank on the board
            '图片': item[1],              # poster image URL
            '标题': item[2],              # film title
            '演员': item[3].strip()[3:],  # actors, with the "主演:" prefix stripped
            '时间': item[4].strip()[5:],  # release time, with the "上映时间:" prefix stripped
            '评分': item[5] + item[6]     # score: integer part plus decimal part
        }

def write_to_file(content):
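    """Append one film record to the result file as a JSON line; ensure_ascii=False keeps the Chinese text readable."""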
with open("maoyan_top_1oo.txt", 'a', encoding='utf-8') as f:
f.write(json.dumps(content, ensure_ascii=False) + '\n') # 字典转字符串
f.close()
def download_images(url):
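    """Download one poster image and hand the raw bytes to save_images; failures are reported, not raised."""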
try:
        response = requests.get(url, timeout=10)  # bound the wait so a stalled download cannot hang a worker
if response.status_code == 200:
print("正在下载" + url)
save_images(response.content)
return None
except RequestException:
print(url + "下载失败")
def save_images(content):
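    """Write the image bytes to disk, named by the MD5 of the content so the same image is never saved twice."""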
    save_dir = "F:/anaconda/猫眼_top100"
    os.makedirs(save_dir, exist_ok=True)  # make sure the target directory exists before writing
    file_path = '{0}/{1}.{2}'.format(save_dir, md5(content).hexdigest(), "jpg")
    if not os.path.exists(file_path):
        with open(file_path, "wb") as f:
            f.write(content)

def main(offset):
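    """Crawl one results page at the given offset: print, persist, and download the poster for each film."""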
url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    if html is None:  # the request failed; skip this page
        return
    for item in parse_one_page(html):
print(item)
write_to_file(item)
download_images(item["图片"])
if __name__ == "__main__":
'''
for i in range(10):
main(i * 10)
'''
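    # Parallel version: one page offset per task; Pool() defaults to one worker process per CPU core.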
pool = Pool()
pool.map(main, [i*10 for i in range(10)])
pool.close()
    pool.join()  # block the main process until all worker processes exit
    print("Scraping finished!")