Python3 Crawler (Object-Oriented)

The multithreading + database storage I mentioned yesterday didn't get implemented; so far only the database and spider wrappers are done, to be finished tomorrow (a rough sketch of how the pieces might be wired together is at the end of this post).

Database wrapper:

'''
Created on 2017-12-07
@filename: SqlClass.py
@author: geng
'''
import pymysql
import traceback

class SqlOpt:
    __link = None
    __conn = None
    __linkTuple = ("localhost", "root", "123456", "test")
    __charset = "utf8"

    def __init__(self):
        try:
            # connect with explicit keywords: positional connect() args
            # are deprecated in newer pymysql releases
            host, user, password, db = self.__linkTuple
            self.__link = pymysql.connect(host=host, user=user,
                                          password=password, database=db,
                                          charset=self.__charset)
        except Exception:
            traceback.print_exc()
            
            
    # DML operation (INSERT/UPDATE/DELETE); returns 1 on success, 0 on failure
    def exec_dml(self, sql):
        try:
            self.__conn = self.__link.cursor()
            self.__conn.execute(sql)
            self.__link.commit()
            return 1
        except Exception:
            self.__link.rollback()
            traceback.print_exc()
            return 0
    
    # DQL operation (SELECT); returns all fetched rows, or None on failure
    def exec_dql(self, sql):
        try:
            self.__conn = self.__link.cursor()
            self.__conn.execute(sql)
            res = self.__conn.fetchall()
            return res
        except Exception:
            traceback.print_exc()
            return None
            
            
    def link_close(self):
        if self.__link:
            self.__link.close()
            
'''                     
link = SqlOpt()
sql = "select * from tb_record limit 0, 10"
res = link.exec_dql(sql)
print(res)   
'''         
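
The DML path can be smoke-tested the same way; a minimal sketch, assuming the test database has a table tb_record with a text column named content (the column name is hypothetical):

from com.geng.SqlClass import SqlOpt

link = SqlOpt()
# exec_dml commits on success and rolls back on failure
if link.exec_dml("insert into tb_record(content) values ('hello')"):
    print("insert committed")
link.link_close()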
           

 Spider wrapper:

'''
Created on 2017-12-07
@filename: SpiderClass.py
@author: geng
'''
import requests
import re

class SpiderOpt:

    def __init__(self, method, url, **kwargs):
        # fire the request immediately; extra kwargs (params, headers,
        # timeout, ...) are passed straight through to requests.request
        self.__response = requests.request(method, url, **kwargs)

    def get_response(self):
        # response body as decoded text
        return self.__response.text

    def regular_opt(self, pat, data):
        # return all non-overlapping matches of pat in data
        com = re.compile(pat)
        rst = com.findall(data)
        return rst
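
A quick smoke test of the wrapper; a minimal sketch (the URL and pattern below are placeholders, not from the original project):

from com.geng.SpiderClass import SpiderOpt

spider = SpiderOpt("GET", "https://example.com", headers={'User-Agent': 'Googlebot'})
html = spider.get_response()
# pull the page title out of the raw HTML
print(spider.regular_opt(r'<title>(.*?)</title>', html))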
        

 Utility class wrapper:

'''
Created on 2017-12-07
@filename: HelperClass.py
@author: geng
'''
import time

class Helper:
    def get_time(self):
        # current time in milliseconds, split at the decimal point:
        # t[0] is the integer millisecond part, t[1] the fraction
        t = str(time.time()*1000).split('.')
        return t
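
main.py below stitches the two parts back together as Tmall's _ksTS request parameter and derives the jsonp callback name from the fractional part, e.g.:

from com.geng.HelperClass import Helper

helper = Helper()
t = helper.get_time()               # e.g. ['1512633600123', '456']
print('{}_{}'.format(t[0], t[1]))   # -> 1512633600123_456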

 Main function:

'''
Created on 2017-12-07
@filename: main.py
@author: geng
'''
from com.geng.SpiderClass import SpiderOpt
from com.geng.HelperClass import Helper

helper = Helper()
# underwear
# url = "https://rate.tmall.com/list_detail_rate.htm?itemId=547773818796&spuId=842179060&sellerId=907782288&order=3&append=0&content=1&tagId=&posi=&picture=&ua=098%23E1hvB9vnvPOvUvCkvvvvvjiPPLqWzjY8RLs9sj3mPmPWljl8RLzvljtWRFqWAjlW9phvHnQGNVinzYswzv5b7MJgzRjw9HuCdphvmpvUG9U4V9v1agwCvvpvCvvv2QhvCvvvMMGCvpvVvmvvvhCvmphvLvA4dQvjEGLIAXZTKFEw9Exrs8TJEcqUAj7Q%2Bul1occ63Wv7rjlEgnLv%2B2Kz8Z0vQRAn%2BbyDCwFIAXZTKFEw9Exr08TJnDeDyO2vHd8tvpvIvvvvvhCvvvvvvUEpphvvs9vv9DCvpvQovvmmZhCv2jhvvUEpphvWw4yCvv9vvUvQORQH1UyCvvOUvvVvayptvpvhvvvvv8wCvvpvvUmmdphvmpvWrUpGPvC1nLyCvvpvvvvv&isg=AurqQavURICRWchqI2pb1fXnO1CGWGXUUQpYDnSi0z2Kp4lhXeg-xXOVQeVA&needFold=0"  
# coat
url = "https://rate.tmall.com/list_detail_rate.htm?itemId=538581707711&spuId=700193432&sellerId=761456278&order=3&append=0&content=1&tagId=&posi=&picture=&ua=098%23E1hvwvvWvRyvUvCkvvvvvjiPPLqUtjtnRsMvgjEUPmP9AjECR2sO6jrPPFsW1jnm3QhvCvmvphm5vpvhvvCCBvhCvvOvChCvvvvEvpCW9a8ByBzhV4g7%2B3%2BuAj7JVXu4X9nr1CuKHdUf8rCl5F%2FAdcH2afmAdX9XjomxfBeKhqUf8rClHd8rejpiYPeAdX9XjLVxfXeKHs9lBdyCvm9vvhCvvvvvvvvvBJZvvUChvvCHtpvv9ZUvvhcDvvmCb9vvBJZvvUhKuphvmvvvpoH%2BZgApkphvC9hvpyPOAvGCvvpvvPMMRphvCvvvphmrvpvEvvV%2Busyv9X6c9phvHHiaTHk9zHi4c4uOts1N7rH4NYGBRphvCvvvphv%3D&isg=AlJSCV-sXM5zGqAyy2IzjS1voxg-YF2cubLwdhyrjoXnL_YpBPDWDbkN6b3o&needFold=0"
regular = [r'\"auctionSku\":\".*?\"', r'\"rateContent\":\".*?\"', r'\"rateDate\":\".*?\"']
info = []
for page in range(1, 5):
    t = helper.get_time()
    params = {
        'currentPage': page,
        '_ksTS': '{}_{}'.format(t[0], t[1]),
        'callback': 'jsonp{}'.format(int(t[1]) + 1)
    }
    kv = {'User-Agent': 'Googlebot'}
    spider = SpiderOpt("GET", url, params=params, headers=kv)
    data = spider.get_response()
    # 0: color/size, 1: review text, 2: review date
    for i in range(3):
        rst = spider.regular_opt(regular[i], data)
        info.append(rst)            # keep the results around for later storage
        print("page :", page, rst)
 

 

Reposted from: https://my.oschina.net/gain/blog/1586446
