使用urllib爬取猫眼电影排行榜信息

爬取代码

import urllib.request
from urllib import error 
from urllib.parse import quote
from bs4 import BeautifulSoup
import time
from mylog import MyLog

class MaoYan(object):
    """Plain record holding the fields scraped for one movie on the board.

    Every field defaults to ``None`` at class level; the scraper assigns
    the real values on each instance it creates.
    """

    index = None  # rank text of the entry
    image = None  # poster URL (read from the <img> ``data-src`` attribute)
    title = None  # movie name
    actor = None  # text of the "star" paragraph
    time = None   # text of the "releasetime" paragraph
    score = None  # integer and fraction parts of the score, concatenated
     
class GetMaoYan(object):
    """Scrape the Maoyan top-movie board and dump it to a tab-separated file.

    Constructing an instance runs the whole job: it reads the page count
    from the board's pager, scrapes every page and writes the result with
    ``pipelines``.
    """

    def __init__(self):
        self.url = 'http://maoyan.com/board/4?'
        self.mylog = MyLog()
        self.pages = self.getPages(self.url)
        self.items = self.spider(self.url, self.pages)
        self.pipelines(self.items)  # save as a text file

    def _fetch(self, url):
        """Download *url* and return the raw body, or ``None`` on failure.

        A ``URLError`` (which includes ``HTTPError``) is logged instead of
        being raised so that one bad request does not abort the whole crawl.
        The response is closed as soon as the body has been read.
        """
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read()
        except error.URLError as e:
            self.mylog.error('爬取%s失败,原因%s' % (url, e))
            return None
        self.mylog.debug('爬取%s成功' % url)
        return html

    def getPages(self, url, delay=2):
        """Return the number of board pages as a string.

        Args:
            url: address of the board page that carries the pager.
            delay: seconds to sleep before the request (politeness delay).

        Returns:
            The stripped text of the pager's ``page_10`` link. ``'0'`` when
            the page could not be downloaded, so nothing is scraped; ``'1'``
            when the page was downloaded but has no such pager link, so the
            page that was just reachable is still scraped.
        """
        time.sleep(delay)
        html = self._fetch(url)
        if html is None:
            return '0'

        soup = BeautifulSoup(html, "lxml")
        pages = None
        # find_all returns a result set; as before, the last pager wins.
        for pager in soup.find_all('ul', attrs={'class': 'list-pager'}):
            link = pager.find('a', attrs={'class': 'page_10'})
            if link is not None:
                pages = link.get_text().strip()
        if pages is None:
            self.mylog.error('未在%s中找到分页信息,仅爬取第1页' % url)
            return '1'
        return pages

    def _parseMovie(self, d):
        """Build a ``MaoYan`` record from one ``<dd>`` tag, or return ``None``.

        ``find`` returns ``None`` for a missing tag, which surfaces as an
        ``AttributeError`` on the chained call; such an entry is logged and
        skipped rather than aborting the crawl.
        """
        try:
            my = MaoYan()
            my.index = d.find('i').get_text().strip()

            p_name = d.find('p', attrs={'class': 'name'})
            my.title = p_name.find('a', attrs={'data-act': 'boarditem-click'}).get_text().strip()

            my.actor = d.find('p', attrs={'class': 'star'}).get_text().strip()

            # The score is split over two <i> tags: integer part and fraction part.
            my.score = (d.find('i', attrs={'class': 'integer'}).get_text().strip()
                        + d.find('i', attrs={'class': 'fraction'}).get_text().strip())

            my.time = d.find('p', attrs={'class': 'releasetime'}).get_text().strip()

            my.image = d.find('img', attrs={'class': 'board-img'}).get('data-src')
        except AttributeError as e:
            self.mylog.error('解析电影条目失败,原因%s' % e)
            return None
        return my

    def spider(self, url, pages, delay=2):
        """Scrape every board page and return the collected ``MaoYan`` records.

        Args:
            url: base address; ``offset=<n>`` is appended directly, so it is
                expected to end with ``?`` or ``&``.
            pages: number of pages to visit (anything ``int()`` accepts).
            delay: seconds to sleep before each request.

        Returns:
            A list of records in page order. Pages that fail to download and
            entries that fail to parse are logged and skipped.
        """
        items = []
        for i in range(int(pages)):
            time.sleep(delay)
            pageurl = url + 'offset=' + str(i * 10)  # each page lists 10 movies
            self.mylog.debug("开始爬取第%s页" % (i + 1))
            html = self._fetch(pageurl)
            if html is None:
                continue

            soup = BeautifulSoup(html, "lxml")
            for dl in soup.find_all('dl', attrs={'class': 'board-wrapper'}):
                for d in dl.find_all('dd'):  # one <dd> per movie
                    my = self._parseMovie(d)
                    if my is None:
                        continue
                    items.append(my)
                    self.mylog.debug("爬取电影名为%s信息成功" % (my.title))
        return items

    def pipelines(self, items):
        """Write *items* to ``猫眼电影.txt`` in the current directory.

        One line per record, tab-separated: index, title, actor, score,
        time, image. An existing file is overwritten.
        """
        bookName = '猫眼电影.txt'
        with open(bookName, 'w', encoding='utf8') as f:
            for item in items:
                f.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    item.index, item.title, item.actor,
                    item.score, item.time, item.image))

    
    
    
if __name__ == '__main__':
    # Instantiating the scraper is what starts the crawl.
    gmy = GetMaoYan()

mylog.py如下:

import logging
import getpass
import sys


class MyLog(object):
    """Small wrapper around a ``logging.Logger`` writing to a file and the console.

    The logger is named after the current user. Because ``logging.getLogger``
    returns the same logger object for the same name, handlers are attached
    only once per process; later ``MyLog()`` instances reuse them instead of
    adding duplicates, which would repeat every log line.
    """

    def __init__(self):
        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)

        # Log file name is derived from the running script:
        # "spider.py" -> "spider.log". Only a real ".py" suffix is removed,
        # so a script name without it is not truncated.
        script = sys.argv[0]
        if script.endswith('.py'):
            script = script[:-3]
        self.logName = script + '.log'

        # Log record format.
        # NOTE(review): %(filename)s / %(funcName)s describe the place where
        # the Logger method is invoked, i.e. the wrapper methods below rather
        # than their callers. Passing ``stacklevel=2`` (Python 3.8+) would fix
        # that; confirm the minimum supported Python version first.
        self.formatter = logging.Formatter('%(asctime)-12s %(filename)s %(funcName)s %(name)s %(message)s\r\n')

        # File handler: reuse the one already attached, otherwise create it.
        self.fileHandler = self._findHandler(logging.FileHandler)
        if self.fileHandler is None:
            self.fileHandler = logging.FileHandler(self.logName, encoding='utf8')
            self.fileHandler.setFormatter(self.formatter)
            self.fileHandler.setLevel(logging.DEBUG)
            self.logger.addHandler(self.fileHandler)

        # Console handler: same reuse-or-create logic.
        self.streamHandler = self._findHandler(logging.StreamHandler)
        if self.streamHandler is None:
            self.streamHandler = logging.StreamHandler()
            self.streamHandler.setFormatter(self.formatter)
            self.streamHandler.setLevel(logging.DEBUG)
            self.logger.addHandler(self.streamHandler)

    def _findHandler(self, handlerType):
        """Return the attached handler whose class is exactly *handlerType*, else ``None``.

        An exact type match is used on purpose: ``FileHandler`` subclasses
        ``StreamHandler``, so ``isinstance`` would mistake the file handler
        for the console handler.
        """
        for handler in self.logger.handlers:
            if type(handler) is handlerType:
                return handler
        return None

    # Convenience methods, one per log level.
    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg)

    def error(self, msg):
        """Log *msg* at ERROR level."""
        self.logger.error(msg)

    def warn(self, msg):
        """Log *msg* at WARNING level."""
        # Logger.warn is a deprecated alias; Logger.warning is the supported name.
        self.logger.warning(msg)

    def info(self, msg):
        """Log *msg* at INFO level."""
        self.logger.info(msg)

    def critical(self, msg):
        """Log *msg* at CRITICAL level."""
        self.logger.critical(msg)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值