import re
from urllib.request import urlopen,Request
from multiprocessing import Pool
def get_page(url, pattern):
    """Download *url* and return its text together with *pattern*.

    Args:
        url: Address of the page to fetch.
        pattern: Regular expression associated with the page. It is not
            used here; it is returned unchanged so the consumer of the
            result receives the pattern next to the text it applies to.

    Returns:
        A ``(pattern, page_source)`` tuple, where ``page_source`` is the
        response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Send a browser-like User-Agent header with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = Request(url=url, headers=headers)
    # Read inside a context manager so the connection is always closed,
    # even if reading or decoding raises.
    with urlopen(req) as response:
        page_source = response.read().decode('utf-8')
    return pattern, page_source  # compiled regex, page source
def parse_page(info):
    """Print one dict per regex match found in a fetched page.

    Args:
        info: A ``(pattern, page_content)`` pair. Every match of
            ``pattern`` in ``page_content`` must yield at least six
            captured groups: index, title, actor, time, and the two
            pieces of the score.

    Each match is printed as a dict with the keys ``index``, ``title``,
    ``actor``, ``time`` and ``score``; all captured values are stripped
    of surrounding whitespace and the two score pieces are joined.
    Nothing is returned.
    """
    pattern, page_content = info
    leading_fields = ('index', 'title', 'actor', 'time')
    for groups in re.findall(pattern, page_content):
        cleaned = [piece.strip() for piece in groups]
        # The first four groups map straight onto the named fields.
        record = dict(zip(leading_fields, cleaned))
        # The score is captured as two groups; glue them back together.
        record['score'] = cleaned[4] + cleaned[5]
        print(record)
if __name__ == '__main__':
    # Script entry point: fetch every URL in ``url_dic`` in a worker process
    # and hand each (pattern, page source) result to ``parse_page``.
    #
    # NOTE(review): the pattern literal had lost its markup and was split
    # across several lines, which is a syntax error. It is rebuilt here from
    # the fragments that survived (``class="board-index``, ``title="..."``,
    # ``class="movie-item-info"``) and from the six groups ``parse_page``
    # indexes (index, title, actor, time, score integer part, score
    # fraction part). The ``<dd>``/``<p>``/``<i>`` tag and class names are
    # assumed -- TODO confirm against the live page before relying on it.
    regex = (
        r'<dd>.*?<.*?class="board-index.*?>(\d+)</i>'
        r'.*?title="(.*?)"'
        r'.*?class="movie-item-info"'
        r'.*?<p class="star">(.*?)</p>'
        r'.*?<p class="releasetime">(.*?)</p>'
        r'.*?<i class="integer">(.*?)</i>'
        r'.*?<i class="fraction">(.*?)</i>'
        r'.*?</dd>'
    )
    # re.S lets ``.`` match newlines, so one pattern can span a whole entry.
    pattern1 = re.compile(regex, re.S)
    url_dic = {'http://maoyan.com/board/7': pattern1}

    pool = Pool()
    try:
        # One asynchronous fetch per URL; parse_page is invoked as the
        # callback with get_page's return value.
        pending = [
            pool.apply_async(get_page, args=(url, pattern), callback=parse_page)
            for url, pattern in url_dic.items()
        ]
        # Wait for every task; get() re-raises any exception raised in a
        # worker instead of letting it pass silently.
        for task in pending:
            task.get()
    finally:
        # Always shut the workers down so no processes are left behind.
        pool.close()
        pool.join()