爬取代码
import urllib.request
from urllib import error
from urllib.parse import quote
from bs4 import BeautifulSoup
import time
from mylog import MyLog
class MaoYan(object):
    """Plain record for one scraped board entry.

    Every field is declared at class level with a default of ``None``;
    the crawler overwrites them per instance as it parses an entry.
    """

    # index, image, title, actor, time and score all start out unset.
    index = image = title = actor = time = score = None
class GetMaoYan(object):
    """Crawler for the Maoyan board listing at ``self.url``.

    Constructing an instance runs the whole job: it reads the page count
    from the board's pager, scrapes every page into ``MaoYan`` records and
    writes them to a tab-separated text file.
    """

    # Seconds to pause before each HTTP request.
    delay = 2

    def __init__(self):
        self.url = 'http://maoyan.com/board/4?'
        self.mylog = MyLog()
        self.pages = self.getPages(self.url)
        self.items = self.spider(self.url, self.pages)
        self.pipelines(self.items)  # save the results as a txt document

    def _fetch(self, url):
        """Return the raw response body of *url*, or None on ``URLError``.

        The failure is logged here so callers only need to test for None.
        """
        try:
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(url) as response:
                return response.read()
        except error.URLError as e:
            self.mylog.error('爬取%s失败,原因%s' % (url, e))
            return None

    def _parseItem(self, dd):
        """Build a ``MaoYan`` record from one ``<dd>`` tag.

        Raises:
            AttributeError: if an expected child tag is missing, because
                ``find`` then returns None.
        """
        my = MaoYan()
        my.index = dd.find('i').get_text().strip()
        nameTag = dd.find('p', attrs={'class': 'name'})
        my.title = nameTag.find(
            'a', attrs={'data-act': 'boarditem-click'}).get_text().strip()
        my.actor = dd.find('p', attrs={'class': 'star'}).get_text().strip()
        # The score is split across two tags: integer part and fraction part.
        integer = dd.find('i', attrs={'class': 'integer'}).get_text().strip()
        fraction = dd.find('i', attrs={'class': 'fraction'}).get_text().strip()
        my.score = integer + fraction
        my.time = dd.find(
            'p', attrs={'class': 'releasetime'}).get_text().strip()
        my.image = dd.find('img', attrs={'class': 'board-img'}).get('data-src')
        return my

    def getPages(self, url):
        """Return the page count shown by the board's pager, as a string.

        Returns:
            The stripped text of the ``page_10`` link of the last
            ``ul.list-pager`` found (presumably the link to the final
            page -- verify against the live markup); ``'0'`` when the page
            cannot be fetched; ``'1'`` when the page loads but has no such
            link, so that the loaded page itself is still crawled.
        """
        time.sleep(self.delay)
        html = self._fetch(url)
        if html is None:
            # Nothing reachable, so there is nothing to crawl.
            return '0'
        self.mylog.debug('爬取%s成功' % url)
        soup = BeautifulSoup(html, "lxml")
        pageCount = None
        for pager in soup.find_all('ul', attrs={'class': 'list-pager'}):
            link = pager.find('a', attrs={'class': 'page_10'})
            if link is not None:
                pageCount = link.get_text().strip()
        if pageCount is None:
            self.mylog.warn('未在%s找到分页信息,默认只爬取1页' % url)
            return '1'
        return pageCount

    def spider(self, url, pages):
        """Scrape *pages* consecutive board pages starting from *url*.

        Args:
            url: base URL; ``offset=<n>`` is appended directly, so it must
                already end with ``?`` or ``&``.
            pages: number of pages to crawl; anything ``int()`` accepts.

        Returns:
            A list of ``MaoYan`` records. Pages that fail to download and
            entries that fail to parse are logged and skipped, so one bad
            page does not discard what was already collected.
        """
        try:
            pageTotal = int(pages)
        except (TypeError, ValueError):
            self.mylog.error('无效的页数%r,停止爬取' % (pages,))
            return []
        items = []
        for i in range(pageTotal):
            time.sleep(self.delay)
            pageurl = url + 'offset=' + str(i * 10)
            self.mylog.debug("开始爬取第%s页" % (i + 1))
            html = self._fetch(pageurl)
            if html is None:
                continue
            soup = BeautifulSoup(html, "lxml")
            for dl in soup.find_all('dl', attrs={'class': 'board-wrapper'}):
                for dd in dl.find_all('dd'):  # one <dd> per entry
                    try:
                        my = self._parseItem(dd)
                    except AttributeError as e:
                        self.mylog.error(
                            '解析第%s页的条目失败,已跳过,原因%s' % (i + 1, e))
                        continue
                    items.append(my)
                    self.mylog.debug("爬取电影名为%s信息成功" % (my.title))
        return items

    def pipelines(self, items, filename='猫眼电影.txt'):
        """Write *items* to *filename*, one tab-separated line per record.

        Columns are index, title, actor, score, time and image. The file
        is truncated first and written as UTF-8.
        """
        with open(filename, 'w', encoding='utf8') as f:
            f.writelines(
                '%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    item.index, item.title, item.actor,
                    item.score, item.time, item.image)
                for item in items)
if __name__ == '__main__':
    # Constructing the crawler is what runs the job: its __init__ fetches the
    # page count, scrapes the pages and writes the output file.
    gmy = GetMaoYan()
mylog.py如下:
import getpass
import logging
import os
import sys
class MyLog(object):
    """Thin wrapper around a ``logging.Logger`` named after the current user.

    Records of level DEBUG and above go to a UTF-8 log file named after the
    running script (``<script>.log``) and to a ``logging.StreamHandler``.
    """

    # Handlers already attached, keyed by logger name. ``logging.getLogger``
    # returns the same logger for the same name, so attaching a fresh pair on
    # every MyLog() would emit each record once per instance created.
    _handlers = {}

    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        # Derive the log file name from the running script; splitext strips
        # whatever extension is present instead of assuming ".py".
        self.logName = os.path.splitext(sys.argv[0])[0] + '.log'
        # Define the log format
        self.formatter = logging.Formatter('%(asctime)-12s %(filename)s %(funcName)s %(name)s %(message)s\r\n')
        # Define the handlers, reusing the pair already attached to this logger
        handlers = MyLog._handlers.get(self.user)
        if handlers is None:
            fileHandler = logging.FileHandler(self.logName, encoding='utf8')
            fileHandler.setFormatter(self.formatter)
            fileHandler.setLevel(logging.DEBUG)
            streamHandler = logging.StreamHandler()
            streamHandler.setFormatter(self.formatter)
            streamHandler.setLevel(logging.DEBUG)
            # Attach the handlers
            self.logger.addHandler(fileHandler)
            self.logger.addHandler(streamHandler)
            handlers = (fileHandler, streamHandler)
            MyLog._handlers[self.user] = handlers
        self.fileHandler, self.streamHandler = handlers

    # Per-level output methods. stacklevel=2 makes %(filename)s and
    # %(funcName)s describe the caller rather than these wrappers.
    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg, stacklevel=2)

    def error(self, msg):
        """Log *msg* at ERROR level."""
        self.logger.error(msg, stacklevel=2)

    def warn(self, msg):
        """Log *msg* at WARNING level."""
        # Logger.warn is a deprecated alias; warning() is the supported name.
        self.logger.warning(msg, stacklevel=2)

    def info(self, msg):
        """Log *msg* at INFO level."""
        self.logger.info(msg, stacklevel=2)

    def critical(self, msg):
        """Log *msg* at CRITICAL level."""
        self.logger.critical(msg, stacklevel=2)