要求:爬取猫眼电影中影片名称、主演、上映时间、上映地区、评分等信息
1.导入相关包
import requests import json, time import re from requests.exceptions import RequestException
2.抓取首页信息
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises a
        ``RequestException`` (timeouts are reported through that hierarchy).
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.89 Safari/537.36'
    }
    # Keep the try body down to the single call that can raise.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
3.正则提取信息
# Matches the content of a <title> element; DOTALL lets it span lines.
_TITLE_RE = re.compile(r'<title>(.*?)</title>', re.S)

# One match per <dd> entry. Capture groups, in order:
#   1. text of the element whose class contains "board-index"
#   2. the data-src attribute (must start with "http")
#   3. the next title="..." attribute after the image
#   4. inside the "star" element: from the first CJK character to end of line
#   5. text of the following element that is closed by </p>
#   6. text of the class="integer" element
#   7. text of the class="fraction" element
_MOVIE_RE = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>'
    r'.*?data-src="(http.*?)"'
    r'.*?>.*?title="(.*?)"'
    r'.*?star">.*?([\u4e00-\u9fa5]+.*?)\n'
    r'.*?</p>.*?>(.*?)</p>'
    r'.*?class="integer">(.*?)</i'
    # The original group here was "(./?)", which only accepted a single
    # character (optionally followed by "/") and silently dropped any entry
    # whose fraction had a different length.
    r'.*?class="fraction">(.*?)</i>',
    re.S,
)


def get_head(html):
    """Print the stripped text of every ``<title>`` element in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string (e.g. the
            result of a failed download) is accepted and prints nothing.

    Returns:
        None.
    """
    if not html:
        return
    for title in _TITLE_RE.findall(html):
        print(title.strip())


def parse_one_page(html):
    """Yield one dict per ``<dd>`` entry matched in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing instead of raising ``TypeError``.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``. ``score`` is the integer part and the
        fraction part concatenated as strings.
    """
    if not html:
        return
    for index, image, title, actor, when, integer, fraction in _MOVIE_RE.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            'actor': actor,
            'time': when,
            'score': integer + fraction,
        }
4.保存到文件
def write_to_file(content, path='result.txt'):
    """Append *content* to a text file as a single line of JSON.

    Args:
        content: Any JSON-serialisable object (the scraper passes a dict).
        path: File to append to; defaults to ``result.txt`` in the current
            working directory. The file is opened in append mode with UTF-8
            encoding.

    Returns:
        None.
    """
    # Serialise once. ensure_ascii=False writes non-ASCII characters as-is
    # instead of \uXXXX escapes, so the output stays human-readable.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
5.整合代码
def main(offset):
    """Fetch one board page, print its title and save every parsed entry.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.

    Returns:
        None.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a request error or a non-200 status;
        # passing that on to the regex helpers would raise TypeError, so
        # report the page and skip it instead of aborting the whole run.
        print('Failed to fetch ' + url)
        return
    get_head(html)
    for entry in parse_one_page(html):
        write_to_file(entry)
6.测试
if __name__ == '__main__':
    # Request ten pages, stepping the offset by 10 each time (0, 10, ... 90)
    # and pausing for one second after each page.
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)
        time.sleep(1)