昨天说的多线程+数据库存储没有实现出来,只是做了数据库和爬虫的封装,明天继续完成。
数据库封装:
'''
Created on 2017年12月7日
@filename: SqlClass.py
@author: geng
'''
import pymysql
import traceback
class SqlOpt:
    """Thin wrapper around a pymysql connection for running DML/DQL SQL.

    Connection parameters are fixed class-level defaults:
    (host, user, password, database) in ``__linkTuple`` plus ``__charset``.
    """

    # (host, user, password, database) — unpacked positionally into connect()
    __linkTuple = ("localhost", "root", "123456", "test")
    __charset = "utf8"

    def __init__(self):
        """Open the MySQL connection; on failure print the traceback.

        ``__link`` stays falsy when the connection could not be opened, so
        ``link_close`` remains safe to call.
        """
        self.__link = None
        self.__conn = None
        try:
            self.__link = pymysql.connect(charset=self.__charset, *self.__linkTuple)
        except Exception:
            # FIX: print_exc() already writes the traceback to stderr and
            # returns None — the original print(traceback.print_exc())
            # emitted a spurious "None" line.
            traceback.print_exc()

    def exec_dml(self, sql):
        """Execute a DML statement (INSERT/UPDATE/DELETE) and commit.

        Returns 1 on success, 0 on failure (the transaction is rolled back
        and the traceback printed).
        """
        try:
            self.__conn = self.__link.cursor()
            try:
                self.__conn.execute(sql)
                self.__link.commit()
                return 1
            finally:
                # FIX: the cursor was never closed in the original.
                self.__conn.close()
        except Exception:
            self.__link.rollback()
            traceback.print_exc()
            return 0

    def exec_dql(self, sql):
        """Execute a DQL statement (SELECT).

        Returns all fetched rows, or None on failure (traceback printed).
        """
        try:
            self.__conn = self.__link.cursor()
            try:
                self.__conn.execute(sql)
                return self.__conn.fetchall()
            finally:
                # FIX: the cursor was never closed in the original.
                self.__conn.close()
        except Exception:
            traceback.print_exc()
            return None

    def link_close(self):
        """Close the underlying connection if one was successfully opened."""
        if self.__link:
            self.__link.close()
'''
link = SqlOpt()
sql = "select * from tb_record limit 0, 10"
res = link.exec_dql(sql)
print(res)
'''
爬虫封装:
'''
Created on 2017年12月7日
@filename: SpiderClass.py
@author: geng
'''
import requests
import re
class SpiderOpt:
    """Perform an HTTP request at construction time and expose the body,
    plus a small regex-extraction helper."""

    def __init__(self, method, url, **kwargs):
        # The request fires immediately; kwargs (params, headers, ...) are
        # forwarded unchanged to requests.request.
        self.__response = requests.request(method, url, **kwargs)

    def get_response(self):
        """Return the decoded text body of the response."""
        return self.__response.text

    def regular_opt(self, pat, data):
        """Return every non-overlapping match of *pat* within *data*."""
        return re.findall(pat, data)
工具类封装:
'''
Created on 2017年12月7日
@filename: HelperClass.py
@author: geng
'''
import time
import random
class Helper:
    """Miscellaneous helpers shared by the spider scripts."""

    def get_time(self):
        """Return the current epoch time in milliseconds, split at the
        decimal point into two strings: [whole_ms, fractional_ms]."""
        millis = time.time() * 1000
        return str(millis).split('.')
Main函数:
'''
Created on 2017年12月7日
@filename: main.py
@author: geng
'''
from com.geng.SpiderClass import SpiderOpt
from com.geng.HelperClass import Helper
helper = Helper()

# Alternative item — underwear (内衣):
# url = "https://rate.tmall.com/list_detail_rate.htm?itemId=547773818796&spuId=842179060&sellerId=907782288&order=3&append=0&content=1&tagId=&posi=&picture=&ua=098%23E1hvB9vnvPOvUvCkvvvvvjiPPLqWzjY8RLs9sj3mPmPWljl8RLzvljtWRFqWAjlW9phvHnQGNVinzYswzv5b7MJgzRjw9HuCdphvmpvUG9U4V9v1agwCvvpvCvvv2QhvCvvvMMGCvpvVvmvvvhCvmphvLvA4dQvjEGLIAXZTKFEw9Exrs8TJEcqUAj7Q%2Bul1occ63Wv7rjlEgnLv%2B2Kz8Z0vQRAn%2BbyDCwFIAXZTKFEw9Exr08TJnDeDyO2vHd8tvpvIvvvvvhCvvvvvvUEpphvvs9vv9DCvpvQovvmmZhCv2jhvvUEpphvWw4yCvv9vvUvQORQH1UyCvvOUvvVvayptvpvhvvvvv8wCvvpvvUmmdphvmpvWrUpGPvC1nLyCvvpvvvvv&isg=AurqQavURICRWchqI2pb1fXnO1CGWGXUUQpYDnSi0z2Kp4lhXeg-xXOVQeVA&needFold=0"
# Current item — coat (大衣): Tmall review-listing endpoint.
url = "https://rate.tmall.com/list_detail_rate.htm?itemId=538581707711&spuId=700193432&sellerId=761456278&order=3&append=0&content=1&tagId=&posi=&picture=&ua=098%23E1hvwvvWvRyvUvCkvvvvvjiPPLqUtjtnRsMvgjEUPmP9AjECR2sO6jrPPFsW1jnm3QhvCvmvphm5vpvhvvCCBvhCvvOvChCvvvvEvpCW9a8ByBzhV4g7%2B3%2BuAj7JVXu4X9nr1CuKHdUf8rCl5F%2FAdcH2afmAdX9XjomxfBeKhqUf8rClHd8rejpiYPeAdX9XjLVxfXeKHs9lBdyCvm9vvhCvvvvvvvvvBJZvvUChvvCHtpvv9ZUvvhcDvvmCb9vvBJZvvUhKuphvmvvvpoH%2BZgApkphvC9hvpyPOAvGCvvpvvPMMRphvCvvvphmrvpvEvvV%2Busyv9X6c9phvHHiaTHk9zHi4c4uOts1N7rH4NYGBRphvCvvvphv%3D&isg=AlJSCV-sXM5zGqAyy2IzjS1voxg-YF2cubLwdhyrjoXnL_YpBPDWDbkN6b3o&needFold=0"

# Extraction patterns — 0: colour/size (auctionSku), 1: review text, 2: review date
regular = [r'\"auctionSku\":\".*?\"', r'\"rateContent\":\".*?\"', r'\"rateDate\":\".*?\"']
info = []
for page in range(1, 5):
    # t = [whole_ms, fractional_ms] — used to fake the _ksTS/callback
    # values the rate endpoint expects from its JSONP client.
    t = helper.get_time()
    params = {
        'currentPage': page,
        '_ksTS': '{}_{}'.format(t[0], t[1]),
        'callback': 'jsonp{}'.format(int(t[1]) + 1),
    }
    headers = {'User-Agent': 'Googlebot'}
    spider = SpiderOpt("GET", url, params=params, headers=headers)
    data = spider.get_response()
    # BUG FIX: the original appended three result lists per page but then
    # popped only the last one, so the colour/size and review-text lists
    # silently accumulated in `info` and only the dates were printed.
    # Collect all three per page, keep them together, and print them all.
    page_info = [spider.regular_opt(pat, data) for pat in regular]
    info.append(page_info)
    print("page : ", page, page_info)