import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import Pool
def page_one_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when requests raises a
        ``RequestException`` (which includes timeouts).
    """
    # Keep the try body to the one call that can raise a RequestException.
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Matches one <dd>...</dd> entry and captures, in order: the board-index
# number, the title attribute, the data-src attribute, the "star" paragraph,
# the "releasetime" paragraph, and the "integer" and "fraction" score parts.
# Raw strings keep "\d" a regex escape instead of an invalid string escape.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?title="(.*?)".*?data-src="(.*?)".*?'
    r'star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,
)


def parse_page_html(content):
    """Yield one dict per ``<dd>`` entry found in *content*.

    Args:
        content: HTML text of a board page. ``None`` or an empty string
            yields nothing, so a failed download does not raise here.

    Yields:
        Dicts with the string-valued keys ``index``, ``title``, ``image``,
        ``actor``, ``createTime`` and ``score``. ``score`` is the integer
        and fraction captures joined together.
    """
    if not content:
        return
    for index, title, image, star, release, integer, fraction in _MOVIE_PATTERN.findall(content):
        yield {
            "index": index,
            "title": title,
            "image": image,
            # The first 3 / 4 characters of the stripped text are dropped.
            # NOTE(review): presumably these are fixed-length label prefixes
            # on the page -- confirm the lengths against the live markup.
            "actor": star.strip()[3:],
            "createTime": release.strip()[4:],
            "score": integer + fraction,
        }
def write_text(item):
    """Append *item* to ``result.txt`` as one JSON line.

    Args:
        item: JSON-serialisable object. Non-ASCII characters are written
            as-is (UTF-8) rather than as ``\\u`` escapes.
    """
    line = json.dumps(item, ensure_ascii=False) + "\n"
    # The with-statement closes the file on exit.
    with open("result.txt", "a", encoding="utf-8") as out:
        out.write(line)
def main(offset):
    """Fetch one board page and append every parsed entry to the result file.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL.
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = page_one_html(url)
    if html is None:
        # page_one_html returns None on a failed download; skip this page
        # instead of handing None to the parser.
        return
    for item in parse_page_html(html):
        write_text(item)
if __name__ == "__main__":
    # Offsets 0, 10, ..., 90 -- one main() call per offset.
    offsets = [i * 10 for i in range(10)]
    # map() blocks until every call has finished; leaving the with-block
    # then shuts the worker processes down instead of leaking them.
    with Pool() as pool:
        pool.map(main, offsets)
from requests.exceptions import RequestException  # note: exception handling is important
import re
import json
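from multiprocessing import Pool  # note: worker pool (original note says "thread pool"; multiprocessing.Pool uses processes) -- makes the downloads fast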