A Python-based data crawler

# -*- coding: utf-8 -*-
import os
import sys
import time
import urllib
from urllib import request, parse
import configparser
import re
import ctypes
import logging
import logging.handlers
import datetime
# Windows console handle and color constants (currently unused; kept for
# optional colored console output via ctypes).
STD_INPUT_HANDLE = -10
STD_OUTPUT_HANDLE = -11
STD_ERROR_HANDLE = -12

FOREGROUND_BLACK = 0x0
FOREGROUND_BLUE = 0x01  # text color contains blue.
FOREGROUND_GREEN = 0x02  # text color contains green.
FOREGROUND_RED = 0x04  # text color contains red.
FOREGROUND_INTENSITY = 0x08  # text color is intensified.

BACKGROUND_BLUE = 0x10  # background color contains blue.
BACKGROUND_GREEN = 0x20  # background color contains green.
BACKGROUND_RED = 0x40  # background color contains red.
BACKGROUND_INTENSITY = 0x80  # background color is intensified.

class logger_t:
    # File logger that writes to <YYYYMMDD>.log, rotating at 1 MB with 5 backups.
    def __init__(self):
        logname = time.strftime('%Y%m%d', time.localtime(time.time())) + ".log"
        LOG_FILE = logname

        handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=5)
        fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'

        formatter = logging.Formatter(fmt)
        handler.setFormatter(formatter)

        self.logger = logging.getLogger('tst')
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)
    def info(self, msg):
        self.logger.info(msg)
    def debug(self, msg):
        self.logger.debug(msg)


class config:
    '''
    This class parses the config file.
    '''
    def __init__(self):
        if not os.path.exists("../../config/config.ini"):
            raise IOError("config file not found: ../../config/config.ini")
        self.cf = configparser.ConfigParser()
        self.cf.read("../../config/config.ini")
    def GetValue(self, sec, key):
        return self.cf.get(sec, key)
class webParser:
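    # Picture crawler: fetches the listing page configured in [Input], follows
    # every sub-page link matched by [PageDown], and downloads the pictures
    # matched by [Element] into the [Output] directory.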
    def __init__(self, config):
        self.http_headers = {
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
        }
        self.config = config
    def __GetHtmlContent(self, url, encod="utf8"):
        req = urllib.request.Request(url, headers=self.http_headers)
        fp = urllib.request.urlopen(req)
        html = fp.read()
        return html.decode(encod)
    def __pickout(self, dict_d):
        # Fetch the page, cut out the block between the 'start' and 'end' markers,
        # then return every match of the configured regular expression inside it.
        s = dict_d['start']
        e = dict_d['end']
        reg = dict_d['reg']
        url = dict_d['url']
        html = self.__GetHtmlContent(url)
        index1 = html.find(s, 0)
        if index1 < 0:
            return []
        data = html[index1:]
        index2 = data.find(e)
        cnt = data[0:index2 + len(e)]
        reg = re.compile(reg)
        urllist = reg.findall(cnt)
        return urllist
    def __createdir(self, path):
        if not os.path.isdir(path):
            os.mkdir(path)
    def __download(self, url, head):
        t1 = time.time()
        head['Accept'] = 'image/webp,image/*,*/*;q=0.8'
        output = self.config.GetValue('Output', 'dir')
        self.__createdir(output)
        index = url.rfind("/")
        picname = url[index + 1:]
        try:
            req = urllib.request.Request(url, headers=head)
            webpage = urllib.request.urlopen(req)
            pic_binary_data = webpage.read()
        except Exception as e1:
            print(e1)
            return 0
        size = len(pic_binary_data)
        # Write the whole image in one go; a binary file write does not need a loop.
        with open(os.path.join(output, picname), "wb") as ids:
            ids.write(pic_binary_data)
        t2 = time.time()
        print(url + " cost " + str((t2 - t1) * 1000) + " ms, size=" + str(size) + " bytes")
        time.sleep(0.1)
        return 1
    def work(self):
        url = self.config.GetValue("Input", "url")
        baseurl_path = urllib.parse.urlparse(url).path
        baseurl = url[:url.find(baseurl_path)]
        # First pass: collect the links to the individual pages ([PageDown] section).
        dic1 = {"start": self.config.GetValue("PageDown", "start"),
                "end": self.config.GetValue("PageDown", "end"),
                "reg": self.config.GetValue("PageDown", "reg"),
                "url": url}
        pic_total_dl = 0
        ts = time.time()
        for i in self.__pickout(dic1):
            refer = baseurl + i
            head = self.http_headers.copy()
            head['Referer'] = refer
            # Second pass: pick the picture URLs out of each page ([Element] section).
            dic = {"start": self.config.GetValue("Element", "start"),
                   "end": self.config.GetValue("Element", "end"),
                   "reg": self.config.GetValue("Element", "reg"),
                   "url": refer}
            piclist = self.__pickout(dic)
            for pic in piclist:
                if self.__download(pic, head):
                    pic_total_dl = pic_total_dl + 1
        te = time.time()
        print(str(pic_total_dl) + " pictures have been downloaded, total cost "
              + str((te - ts) * 1000) + " ms")
#class ThreadMng:


if __name__ == '__main__':
    cfg = config()
    wb = webParser(cfg)
    wb.work()
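
The script reads its settings from ../../config/config.ini. The section and key names below come straight from the code (Input/url, Output/dir, and start/end/reg in PageDown and Element); the actual values are site-specific, so the ones shown here are only a hypothetical sketch of what such a file could look like:

# Hypothetical example of config.ini; markers and regular expressions must be
# adapted to the target site.
[Input]
url = http://www.example.com/gallery/index.html

[Output]
dir = ./pics

[PageDown]
# Block on the index page that holds the links to the sub-pages.
start = <div class="pagelist">
end = </div>
reg = href="([^"]+\.html)"

[Element]
# Block on each sub-page that holds the image URLs to download.
start = <div class="content">
end = </div>
reg = src="(http[^"]+\.jpg)"

Since __pickout returns whatever the regular expression's capture group matches, the PageDown pattern should capture a path that can be appended to the base URL (it is also used as the Referer), while the Element pattern should capture full picture URLs that __download can fetch directly.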