from urllib import request as r
import re
class Spider():
    """Fetch the Douban movie chart page and pull per-movie fields out with regexes.

    The class attributes hold the page URL and the regular expressions used
    to cut the page into per-movie fragments and to extract fields from each
    fragment. ``go()`` is the only public entry point.
    """

    # Page downloaded by __content().
    url='https://movie.douban.com/chart'
    # Captures one fragment per movie: everything between the row's opening
    # <tr class="item"> tag and the next </table>.
    root_pattern='<tr class="item">(.*?)</table>'
    # Captures the title attribute of the movie's "nbg" link.
    movie_name_pattern='<a class="nbg" href=".*?" title="(.*?)">'
    # Captures the text of the rating_nums span.
    movie_score_pattern='<span class="rating_nums">(.*?)</span>'
    # Captures the text of each <span class="pl"> element
    # (presumably the rating count -- verify against the live page).
    movie_number_pattern='<span class="pl">(.*?)</span>'

    def __content(self):
        """Download ``url`` and return the response body decoded as UTF-8.

        Raises:
            urllib.error.URLError: if the request fails.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        # The context manager closes the HTTP response even if read() or
        # the decode raises.
        with r.urlopen(self.url) as response:
            raw = response.read()
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Split the page into per-movie fragments and extract the raw fields.

        Args:
            htmls: the page markup as a string.

        Returns:
            A list with one dict per fragment. Each dict has the keys
            'name', 'score' and 'number', and each value is the list of all
            matches of the corresponding pattern within that fragment (the
            list is empty when the pattern does not match).
        """
        # Compile each pattern once per call instead of once per fragment.
        # re.S lets '.' span newlines, since the markup is multi-line.
        root_re = re.compile(self.root_pattern, re.S)
        name_re = re.compile(self.movie_name_pattern, re.S)
        score_re = re.compile(self.movie_score_pattern, re.S)
        number_re = re.compile(self.movie_number_pattern, re.S)
        return [
            {
                'name': name_re.findall(fragment),
                'score': score_re.findall(fragment),
                'number': number_re.findall(fragment),
            }
            for fragment in root_re.findall(htmls)
        ]

    def __refine(self, informations):
        """Reduce each raw record to its first match per field, stripped.

        Args:
            informations: iterable of dicts as produced by ``__analysis``.

        Returns:
            A lazy ``map`` iterator of dicts with the keys 'name', 'score'
            and 'number'. Each value is the first match with surrounding
            whitespace removed, or ``None`` when the field had no match.
        """
        def first_stripped(matches):
            # strip() must be *called*; storing the bound method would put
            # a method object into the result instead of a string.
            return matches[0].strip() if matches else None

        def refine_one(information):
            return {
                'name': first_stripped(information['name']),
                'score': first_stripped(information['score']),
                'number': first_stripped(information['number']),
            }

        return map(refine_one, informations)

    def go(self):
        """Run the spider: download the page, analyse it and print the records."""
        htmls = self.__content()
        informations = self.__analysis(htmls)
        # The __refine() step is not applied here; the raw per-field match
        # lists are printed as-is.
        print(informations)
# Script entry: create a Spider instance and run it.
# NOTE(review): this rebinds the module-level name `Spider` from the class to
# an instance, so the class is no longer reachable by that name afterwards.
Spider=Spider()
Spider.go()