# coding=utf-8
import requests
import re
import json
from io import open
#from multiprocessing import Pool
from requests.exceptions import RequestException
# Minimal User-Agent header; without it Maoyan rejects the request as a bot.
headers = {'User-Agent':'Mozilla/5.0 '}
def get_one_page(url):
    """Download one board page and return its HTML text.

    Parameters
    ----------
    url : str
        Full URL of a Maoyan board page.

    Returns
    -------
    str or None
        The response body on HTTP 200, otherwise None (also on any
        requests-level failure such as timeout or connection error).
    """
    try:
        # timeout so a stalled connection cannot hang the crawl forever
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            print(url)  # progress indicator: page fetched successfully
            return res.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    """Extract the movie records from one board page.

    Parameters
    ----------
    html : str or None
        Raw HTML of a board page; None (failed download) yields [].

    Returns
    -------
    list of dict
        One dict per movie with keys: index, image, title, actor,
        time, score (all string values).
    """
    if not html:
        # Failed or empty download: nothing to parse.
        return []
    # One <dd>...</dd> chunk per movie entry on the board.
    items = re.findall(r'<dd>.*?class="board-index board-index-.*?</dd>', html, re.S)
    content = []
    for chunk in items:
        item = re.search(
            r'<dd>.*?i class="board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
            r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
            r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
            chunk, re.S)
        if item is None:
            # Entry does not match the expected layout; skip instead of
            # crashing on item.group(...) of None.
            continue
        content.append({
            'index': item.group(1),
            'image': item.group(2),
            'title': item.group(3),
            # drop the 3-character "starring" label prefix
            'actor': item.group(4).strip()[3:],
            # drop the 5-character "release time" label prefix
            'time': item.group(5).strip()[5:],
            # score is split into integer ("9.") and fraction ("5") parts
            'score': item.group(6) + item.group(7),
        })
    return content
def write_to_file(content):
    """Append *content* to res.txt as a UTF-8 JSON string.

    Parameters
    ----------
    content : any JSON-serializable object
        Typically the list of movie dicts from parse_one_page.
    """
    # 'a' accumulates results across the 10 pages of one crawl run.
    with open('res.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII movie titles human-readable.
        # No explicit f.close(): the with-block already closes the file.
        f.write(json.dumps(content, ensure_ascii=False))
def main():
    """Crawl all 10 pages (offset 0, 10, ..., 90) of the Maoyan Top-100 board.

    Each page is downloaded, parsed, echoed to stdout and appended to
    res.txt.
    """
    for page in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(page * 10)
        html = get_one_page(url)
        if html is None:
            # Download failed; skip this page instead of crashing the parser.
            continue
        content = parse_one_page(html)
        print(content)
        write_to_file(content)
if __name__ == '__main__':
    # Sequential crawl; the multiprocessing Pool variant is kept commented
    # out for reference (NOTE(review): as written it would pass offsets to
    # main(), which takes no arguments — confirm before re-enabling).
    #p = Pool()
    #p.map(main, [i * 10 for i in range(10)])
    main()
# Crawler: scraping the Maoyan Top-100 movie board with regular expressions.
# (Residual blog metadata removed; originally published on a blog, last updated 2023-11-29.)