该楼层疑似违规已被系统折叠 隐藏此楼查看此楼
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request
import urllib
import urllib.error
import re
import sys
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
}
url = 'https://book.douban.com/top250'
class bookTop250:
def __init__(self):
#设置默认编码格式为utf-8
self.start = 0
self.param = 'no-cache'
self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
self.movieList = []
self.filePath = 'E:/crawler/DoubanTop250.txt'
def getPage(self):
try:
URL = 'https://book.douban.com/top250' + str(self.start)
req = urllib.request(url=URL, headers=self.headers)
response = urllib.request.urlopen(req)
page = response.read().decode('utf-8')
pageNum = (self.start + 25) / 25
print( '正在抓取第' + str(pageNum) + '页数据...')
self.start += 25
return page
except urllib.error.URLError as e:
if hasattr(e, 'reason'):
print('抓取失败,具体原因:', e.reason)
def getMovie(self):
pattern = re.compile(u'
.*?.*?'+u'
.*?[\n][\s]+[\n][\s]+'+u'(.*?)[\n][\s]+(.*?)'
, re.S)
while self.start <= 225:
page = self.getPage()
movies = re.findall(pattern, page)
for movie in movies:
self.movieList.append([movie[0], movie[1], movie[2].lstrip(),
movie[3].lstrip, movie[4]])
def writeTxt(self):
fileTop250 = open(self.filePath, 'w')
try:
for movie in self.movieList:
fileTop250.write('shuming:' + movie[0] + '\r\n')
fileTop250.write('zuozhe:' + movie[1] + '\r\n')
fileTop250.write('pingfen:' + movie[2] + '\r\n')
fileTop250.write('renshu:' + movie[3] + '\r\n')
fileTop250.write('导演姓名:' + movie[4] + '\r\n')
print('文件写入成功...')
finally:
fileTop250.close()
def main(self):
print('正在从豆瓣电影Top250抓取数据...')
self.getMovie()
self.writeTxt()
print('抓取完毕...')
DouBanSpider = bookTop250()
DouBanSpider.main()
在pycharm上运行后报错如下,百度了半天不知如何修改了?求大佬解答如何修改才能正确运行