# -*- coding:utf-8 -*-
import urllib2
import re
class MovieTop250(object):
    """Scraper for the Douban Top 250 movie listing.

    Attributes:
        start: offset of the next listing page to request; it is sent as
            the ``start`` query parameter and advanced by ``PAGE_SIZE``.
        movielist: accumulated records, each a list
            ``[rank, title, alias, rating, quote]`` of strings.
    """

    # Step between two consecutive values of the ``start`` query parameter.
    PAGE_SIZE = 25
    # Offsets 0, 25, ..., 225 cover 250 entries.  The previous loop bound
    # (``<= 255``) also requested offset 250, one page past the end.
    TOTAL = 250

    # One match per entry: rank, title, alias, rating, optional quote.
    # * Raw strings keep ``\w`` from being read as a string escape.
    # * ``class="\w{5}"`` matches any five-letter class name, as before.
    # * NOTE (from the original author): the page source appears to show a
    #   bare ``class`` attribute on <em>, but the pattern has to spell it
    #   ``class=""``.
    # * The quote group is optional and may not run past the next rank
    #   marker.  With a mandatory quote, an entry lacking one made the
    #   pattern swallow the following entry (re.S lets ``.`` cross lines).
    PATTERN = re.compile(
        r'<em.*?class="">(.*?)</em>.*?'
        r'<a.*?>.*?<span class="title">(.*?)</span>.*?'
        r'<span class="\w{5}">(.*?)</span>.*?'
        r'<span class="rating_num".*?>(.*?)</span>'
        r'(?:(?:(?!<em[^>]*class="">).)*?<span class="inq">(.*?)</span>)?',
        re.S)

    def __init__(self):
        self.start = 0
        self.movielist = []

    def getPage(self):
        """Download the listing page at offset ``self.start``.

        Returns:
            The response body, or ``None`` when the request fails with
            ``urllib2.URLError`` (the reason is printed in that case).
        """
        headers = {"User-Agent": "Mozilla/5.0(Windows NT 6.1;WOW64)"}
        url = "https://movie.douban.com/top250?start=" + str(self.start)
        request = urllib2.Request(url=url, headers=headers)
        try:
            response = urllib2.urlopen(request)
            try:
                page = response.read()
            finally:
                # Release the connection even if the read fails.
                response.close()
        except urllib2.URLError as e:
            print(e.reason)
            return None
        # Report pages 1-based; ``//`` keeps the result an integer.
        pageNum = str(self.start // self.PAGE_SIZE + 1)
        print("正在抓取第" + pageNum + "页")
        return page

    def parsePage(self, page):
        """Extract the movie records from one listing page.

        Args:
            page: HTML of a listing page; ``None`` or an empty string
                yields an empty list.

        Returns:
            A list of ``[rank, title, alias, rating, quote]`` lists in page
            order.  ``quote`` is ``''`` for an entry without a quote.
        """
        if not page:
            return []
        records = []
        for rank, title, alias, rating, quote in self.PATTERN.findall(page):
            # NOTE(review): the alias separator is presumably emitted as
            # '&nbsp;/&nbsp;' -- confirm against the live markup.  Entities
            # are turned into spaces so the strip below can remove it.
            # lstrip() takes a set of characters, not a prefix string.
            alias = alias.replace('&nbsp;', ' ').lstrip(' /')
            records.append([rank, title, alias, rating, quote])
        return records

    def getMovie(self):
        """Fetch every remaining listing page and collect its records.

        Continues from ``self.start`` and appends to ``self.movielist``.
        A page that could not be downloaded is skipped.
        """
        while self.start < self.TOTAL:
            page = self.getPage()
            # getPage() returns None on a network error; feeding that to
            # the regex would raise TypeError.
            if page is not None:
                self.movielist.extend(self.parsePage(page))
            self.start += self.PAGE_SIZE

    def writeTXT(self):
        """Write ``self.movielist`` to ``doubanmovietop250.txt``.

        Each record becomes five labelled lines (rank, title, alias,
        rating, quote) followed by an empty line; lines end in CRLF.
        """
        labels = ('电影排名: ', '电影名称: ', '原名别名: ',
                  '累计评分: ', '简评: ')
        # Binary mode: the CRLF terminators are written explicitly, and
        # text mode on Windows would expand them to '\r\r\n'.
        with open('doubanmovietop250.txt', 'wb') as movietop250:
            for movie in self.movielist:
                for label, value in zip(labels, movie):
                    movietop250.write(label + value + '\r\n')
                movietop250.write('\r\n')
# Script driver: these statements run whenever the module is executed or
# imported (there is no __main__ guard).
doubanspider = MovieTop250()
doubanspider.getMovie()  # download the listing pages and fill doubanspider.movielist
doubanspider.writeTXT()  # write the collected records to doubanmovietop250.txt
# The two methods are started separately, one after the other.
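# Takeaways:
# 1. Once the steps are wrapped as methods of a class, both of them
#    (getMovie and writeTXT) have to be called explicitly, as above;
#    alternatively, one method can start the other by calling it through
#    self, e.g. self.getMovie().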