# 爬取猫眼top100排行榜电影的相关信息:
# 猫眼电影改版了/20171214
# 在IPython / Jupyter notebook里为什么无法运行multiprocessing模块,一直显示“*”啊啊啊啊啊!Python自带的IDE好使啊,改天用PyCharm试一下。/20171214
# pycharm里可以运行,但写入文件时为什么又出现了编码错误的问题……/20171219
# 问题解决了,在Jupyter notebook里也能实现了。这是由Jupyter的特性决定的:要将代码当作文件引入(import)。感谢StackOverflow里的大牛:https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3?answertab=oldest#tab-top /20171220
"""Scrape the Maoyan TOP100 movie board and append each entry to result.txt."""
import json
import re
from multiprocessing import Pool
from typing import Dict, Iterator, Optional

import requests
from fake_useragent import UserAgent
from requests.exceptions import RequestException

# Seconds to wait on the server before giving up on a page. Without a timeout
# a stalled connection would block a pool worker forever.
_REQUEST_TIMEOUT = 10

# Matches one <dd> element per movie. Capture groups, in order: rank, poster
# URL, title, cast line, release line, integer part of the score, fractional
# part of the score. re.S lets '.' match newlines because each <dd> element
# spans several lines of HTML.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_one_page(url: str) -> Optional[str]:
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body when the server answers with HTTP 200; ``None`` on
        any other status code or when the request fails (connection error,
        timeout, ...).
    """
    # The original author noted that plain requests started being rejected by
    # the site, so a random browser User-Agent from fake_useragent is sent.
    headers = {'User-Agent': UserAgent().random}
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html: Optional[str]) -> Iterator[Dict[str, str]]:
    """Yield one dict per movie found in a board page.

    Args:
        html: Page source as returned by ``get_one_page``. ``None`` or an
            empty string yields nothing instead of raising.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``; all values are strings.
    """
    if not html:
        return
    for rank, image, title, star, release, integer, fraction in (
            _MOVIE_PATTERN.findall(html)):
        yield {
            'index': rank,
            'image': image,
            'title': title,
            # Strip surrounding whitespace, then drop the first 3 characters
            # (presumably the "主演:" label -- confirm against the live page).
            'actor': star.strip()[3:],
            # Same idea with a 5-character prefix (presumably "上映时间:").
            'time': release.strip()[5:],
            # The score is split across two tags; join them back together.
            'score': integer + fraction,
        }


def write_to_file(content: Dict[str, str]) -> None:
    """Append ``content`` to result.txt as one JSON object per line.

    Args:
        content: A movie record as produced by ``parse_one_page``.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the file instead of
    # \uXXXX escapes; the file is therefore opened explicitly as UTF-8.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset: int) -> None:
    """Fetch, print and store the board page that starts at ``offset``.

    Args:
        offset: Value of the ``offset`` query parameter of the board URL.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Report the failed page and carry on, so one bad page does not
        # abort the whole pool.map() run.
        print('Failed to fetch', url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Ten page offsets (0, 10, ..., 90) handled by a process pool. Workers
    # append to result.txt independently, so records from different pages
    # are not guaranteed to appear in rank order.
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()