"""Scrape the Douban Top 250 movie list and append each parsed entry to a file.

The ten result pages (25 movies each) are fetched in parallel by a pool of
worker processes; every worker parses its page with a regular expression and
appends one line per movie to ``result.tx``.
"""
import re
from multiprocessing.pool import Pool

import requests
from requests import RequestException

# Regex-writing notes carried over from the original annotations:
#   * Prefer non-greedy matches (``.*?``) and pass ``re.S`` so that ``.`` also
#     matches newlines.
#   * Anchor each capture on a keyword that precedes it (``alt=``, ``src=``,
#     ``class=``) and end it with the closing tag of the same element,
#     e.g. ``<p class="">(.*?)</p>``.
# Capture groups, by index: 0 = ``alt`` text, 1 = image ``src`` ('pic'),
# 2 = ``<p class="">`` body ('actor'), 3 = first ``<span>`` ('commentSum'),
# 4 = ``class="inq"`` span body.
# Compiled once at import time instead of on every parse_one_page() call.
_MOVIE_PATTERN = re.compile(
    '<li>.*?alt="(.*?)".*?src="(.*?)".*?<p class="">(.*?)</p>.*?<span>(.*?)</span>.*?class="inq">(.*?)</span>.*?</li>',
    re.S)


def get_one_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever and stall a worker.

    Returns:
        The page text when the server answers with HTTP 200; ``None`` for any
        other status code or when the request raises ``RequestException``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one record per ``<li>`` entry matched in *html*.

    Args:
        html: Page source as returned by ``get_one_page``. ``None`` or an
            empty string yields nothing instead of raising ``TypeError``.

    Yields:
        A set holding capture group 0 (the ``alt`` text) and capture group 4
        (the ``class="inq"`` text) of each match.
    """
    if not html:
        return
    # findall returns every match as a tuple of the five capture groups.
    for item in _MOVIE_PATTERN.findall(html):
        # ``yield`` hands back one record each time the generator is advanced.
        # NOTE: this is a set literal, so the order of the two strings in its
        # printed form is not guaranteed, and equal strings collapse to one.
        yield {item[0], item[4]}


def write_to_file(content):
    """Append ``str(content)`` followed by a newline to ``result.tx``.

    Args:
        content: Any object; it is converted with ``str`` before writing.
    """
    # NOTE(review): 'result.tx' looks like a typo for 'result.txt' — kept
    # unchanged so existing output locations do not move; confirm.
    # utf-8 so that Chinese text is written correctly on every platform.
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open('result.tx', 'a', encoding='utf-8') as ff:
        ff.write(str(content) + '\n')


def main(start):
    """Download, parse, print and store the result page starting at *start*.

    Args:
        start: Value of the ``start`` query parameter (offset of the first
            movie on the page).
    """
    url = 'https://movie.douban.com/top250?start=' + str(start)
    html = get_one_page(url)
    if html is None:
        # The download failed or returned a non-200 status: nothing to parse.
        return
    # Iterating the generator pulls the yielded records one at a time.
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


# Script entry point (the counterpart of a ``main`` method in other languages).
if __name__ == '__main__':
    # Pool() is a pool of worker *processes*, not threads. map() takes the
    # function to run and an iterable of arguments (offsets 0, 25, ..., 225)
    # and blocks until every page is done; the ``with`` block then releases
    # the workers.
    with Pool() as pool:
        pool.map(main, [i * 25 for i in range(10)])
# Tags: requests, re, Douban Top 250
# Latest recommended article published on 2023-05-13 10:52:54