import re
import time
import urllib.request

import pymongo

print("开始")

# Connect to the MongoDB server on localhost:27017 and select the
# "maoyan" collection of the "test" database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.test
collection = db.maoyan


def gethtml(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8.

    Sleeps for two seconds before every request so that successive calls
    are throttled.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The page content as ``str`` when the response status is 200,
        otherwise ``None``.

    Raises:
        urllib.error.URLError: If the request fails (``urlopen`` raises
            its ``HTTPError`` subclass for HTTP error statuses).
        UnicodeDecodeError: If a 200 response body is not valid UTF-8.
    """
    time.sleep(2)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        # Check the status before touching the body so that a non-200
        # reply yields None instead of a decoding error.
        if response.status != 200:
            return None
        return response.read().decode('utf-8')
# One <dd>...</dd> entry of the board page. The seven capture groups are,
# in order: rank index, poster image URL, title, cast line, release line,
# integer part of the score, fractional part of the score.
# Compiled once at import time instead of on every getContent() call.
_BOARD_ITEM_RE = re.compile(
    '<dd>.*?board-index.*?>(.*?)</i>'
    '.*?data-src="(.*?)"'
    '.*?name.*?a.*?>(.*?)</a>'
    '.*?star.*?>(.*?)</p>'
    '.*?releasetime.*?>(.*?)</p>'
    '.*?integer.*?>(.*?)</i>'
    '.*?fraction.*?>(.*?)</i>'
    '.*?</dd>',
    re.S)


def _parse_board(html):
    """Return one dict per board entry matched in *html*."""
    return [
        {
            'index': index,
            'image': image,
            'title': title.strip(),
            # Drop the first 3 characters of the cast line (presumably a
            # fixed label prefix -- verify against the live markup).
            # Slicing a shorter string simply yields ''.
            'actor': actor.strip()[3:],
            # Drop the first 5 characters of the release line (presumably
            # a fixed label prefix -- verify against the live markup).
            'time': release.strip()[5:],
            'score': integer.strip() + fraction.strip(),
        }
        for index, image, title, actor, release, integer, fraction
        in _BOARD_ITEM_RE.findall(html)
    ]


def getContent(html):
    """Parse a board page with a regular expression and store it in MongoDB.

    Args:
        html: Page source as returned by ``gethtml``; may be ``None``.

    Returns:
        None. Every matched entry is inserted into the module-level
        ``collection``. Nothing is written when *html* is ``None`` or
        contains no matching entries.
    """
    if html is None:
        return
    documents = _parse_board(html)
    # insert_many() rejects an empty list, so only call it when there is
    # something to write.
    if documents:
        collection.insert_many(documents)


# Crawl the board pages and store their entries in MongoDB.
url = "http://maoyan.com/board/4?offset="
# NOTE(review): range(6, 10) requests offsets 60, 70, 80 and 90 only --
# confirm whether offsets 0-50 are meant to be skipped.
for i in range(6, 10):
    html = gethtml(url + str(i * 10))
    getContent(html)
print("结束")