log_config.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Logging helper shared by the crawler scripts
import logging
import sys

# Python 2: force UTF-8 as the default encoding so Chinese text logs cleanly
reload(sys)
sys.setdefaultencoding('utf-8')


def getlogger(logName, logFile):
    logger = logging.getLogger(logName)
    logger.setLevel(logging.DEBUG)
    # Console handler
    screenHandle = logging.StreamHandler()
    screenHandle.setLevel(logging.DEBUG)
    # File handler, appending to logFile
    fileHandle = logging.FileHandler(logFile, 'a')
    fileHandle.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    screenHandle.setFormatter(formatter)
    fileHandle.setFormatter(formatter)
    logger.addHandler(fileHandle)
    logger.addHandler(screenHandle)
    return logger
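A quick way to confirm the helper works, as a minimal sketch; the logger name and file name below are simply the ones that requests_to_mysql.py (further down) passes in:

# sketch: the logger should print to the console and append to reference_mysql.log
import log_config

logger = log_config.getlogger('reference_mysql', 'reference_mysql.log')
logger.info('logger is ready')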
mysql.conf
[mysql]
user=your MySQL user (e.g. root)
password=your password
database=your database
host=localhost
port=3306
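Before launching the crawler it helps to confirm that these settings actually connect. A minimal sketch, assuming mysql.conf sits in the current working directory and the target database already exists:

# sketch: read mysql.conf and open a throwaway connection to verify the credentials
import ConfigParser
import pymysql

conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")
conn = pymysql.connect(host=conf.get("mysql", "host"),
                       port=int(conf.get("mysql", "port")),  # ConfigParser returns strings, pymysql wants an int
                       user=conf.get("mysql", "user"),
                       passwd=conf.get("mysql", "password"),
                       db=conf.get("mysql", "database"))
cursor = conn.cursor()
cursor.execute("SELECT VERSION()")
print cursor.fetchone()  # e.g. ('5.7.23',)
conn.close()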
requests_to_mysql.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import ConfigParser
import json
import random
import sys
import time
import datetime

import pymysql
import requests

import log_config

logger = log_config.getlogger('reference_mysql', 'reference_mysql.log')

# Read the MySQL settings from mysql.conf
conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")
user = conf.get("mysql", "user")
password = conf.get("mysql", "password")
database = conf.get("mysql", "database")
host = conf.get("mysql", "host")
port = conf.get("mysql", "port")

siteURL = 'the request URL you want to crawl'
fileurl = 'the domain to prepend to the crawled attachment paths'
headers = {'Host': 'the domain of the target site',
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3103.400 QQBrowser/9.6.11372.400'}

# The target site usually filters announcements through several layers of criteria,
# so list every value you need for each filter; this normally includes a date range.
cate_dict = {'key': 'value'}
moudue_dict = {'key': 'value'}
industry_dict = {'key': 'value'}
date_list = ['2018-10-10']

date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
logger.info("start getting %s data" % date)

# The startup argument decides whether to crawl only today's data or all historical data.
# sys.argv is a list; when the script is started without arguments, sys.argv[0] is the script path.
if len(sys.argv) != 1:
    if sys.argv[1] == 'all':
        date = ''
    else:
        logger.info('input error, please pass "all" or no argument')
        sys.exit()


# Get the total number of result pages for one filter combination
def get_page(dates, category, mod, industry):
    data = {'seDate': dates,
            'pageNum': 1,
            'pageSize': 30,
            'category': cate_dict[category],
            'column': 'szse',
            'plate': mod,
            'tabName': 'fulltext',
            'trade': industry}
    req = requests.post(siteURL, headers=headers, data=data)
    content = json.loads(req.text)
    # filelist = content['announcements']
    filesum = content['totalAnnouncement']
    # print filesum
    if filesum != 0:
        if filesum % 30 == 0:
            pages = filesum / 30
        else:
            pages = filesum / 30 + 1
        return pages
    else:
        return 0


# Fetch one page of announcements and store it in MySQL
def get_page_data(dates, category, page, module_type, industry):
    # The current time must be formatted like this, otherwise MySQL's datetime column rejects the value
    now_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    data = {'seDate': dates,
            'pageNum': page,
            'pageSize': 30,
            'category': cate_dict[category],
            'column': 'szse',
            'plate': module_type,
            'tabName': 'fulltext',
            'trade': industry}
    logger.info("getting page %s" % str(page))
    retries = 0
    content = ""
    while retries < 3:
        try:
            req = requests.post(siteURL, headers=headers, data=data)
            content = req.text
            break
        except Exception as e:
            logger.error("get data failed: %s" % e)
            retries += 1
            logger.info('req error, retry %s' % retries)
            t = random.uniform(1, 2)
            time.sleep(t)
    try:
        content = json.loads(content)
        filelist = content['announcements']
        logger.info("filelist=%s" % len(filelist))
        page_datas = []
        for fileone in filelist:
            # Processing status of the record in MySQL
            pro_status = 0
            # Retry counter used later by the Java side that parses the URL; default 0 here
            retry_count = 0
            sec_code = fileone['secCode']
            sec_name = fileone['secName']
            announcement_title = fileone['announcementTitle']
            announcement_time = fileone['announcementTime']
            public_time = date_long_to_str(announcement_time)
            adjunct_url = fileurl + fileone['adjunctUrl']
            page_data = [category, cate_dict[category], industry_dict[industry], module_type,
                         public_time, public_time, sec_code, sec_name, announcement_title,
                         adjunct_url, pro_status, retry_count, now_date, now_date]
            page_datas.append(page_data)
        if len(page_datas) > 0:
            set_data_mysql(page_datas)
    except Exception as e:
        logger.error('get this page detail error... [cat:%s industry:%s module_type:%s date:%s] %s'
                     % (category, industry, module_type, dates, e))


# Bulk insert one page of records into MySQL
def set_data_mysql(page_datas):
    # Open the connection
    conn = pymysql.connect(host=host, port=int(port), user=user, passwd=password, db=database)
    # Create a cursor
    cursor = conn.cursor()
    sql = "INSERT INTO test(your 14 columns go here) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    effect_row = cursor.executemany(sql, page_datas)
    # Commit; without a commit nothing reaches MySQL
    conn.commit()
    logger.info("already inserted into database, affected rows: %s" % effect_row)
    # # The two lines below show a single-row insert instead:
    # # listOne = ('年度报告', 'category_ndbg_szsh;', dt)  # '年度报告' means 'annual report'
    # # effect_row = cursor.execute(sql, listOne)
    # conn.commit()  # a commit is still required for the row to reach the database
    # print effect_row


# Convert a long timestamp in milliseconds to a string, e.g. 1539001526000 -> 2018-10-08 20:25:26
def date_long_to_str(long_date):
    if long_date == "" or long_date == 0:
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    fommat_time = time.localtime(long(long_date) / 1000)
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", fommat_time)
    return time_str


# Loop over every filter combination and crawl it
def collect_cate():
    if date == '':
        # 'all' mode: walk through every date in date_list
        for seDate in date_list:
            for mod in moudue_dict:
                for category in cate_dict:
                    for industry in industry_dict:
                        # logger.info("category=%s, mod=%s, industry=%s" % (category, mod, industry))
                        pages = get_page(seDate, category, moudue_dict[mod], industry)
                        # logger.info("pages = %s" % pages)
                        for page in range(1, pages + 1):
                            get_page_data(seDate, category, page, moudue_dict[mod], industry)
    else:
        # default mode: only today's announcements
        for mod in moudue_dict:
            for category in cate_dict:
                for industry in industry_dict:
                    # logger.info("category=%s, mod=%s, industry=%s" % (category, mod, industry))
                    pages = get_page(date, category, moudue_dict[mod], industry)
                    # logger.info("pages = %s" % pages)
                    if 0 != pages:
                        for page in range(1, pages + 1):
                            get_page_data(date, category, page, moudue_dict[mod], industry)


if __name__ == "__main__":
    collect_cate()
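The INSERT in set_data_mysql expects a table with 14 columns in exactly the order of page_data (category, category code, industry code, module type, the publish time twice, sec_code, sec_name, title, attachment URL, processing status, retry count, and the create/update timestamps). The post does not show the DDL, so the sketch below is only an illustration of that shape; every column name and type is invented and should be replaced with your real schema.

# sketch: create a 14-column table matching the order of page_data
# all column names below are hypothetical -- adjust to your real schema
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS test (
    id                 BIGINT AUTO_INCREMENT PRIMARY KEY,
    category           VARCHAR(64),
    category_code      VARCHAR(64),
    industry_code      VARCHAR(64),
    module_type        VARCHAR(64),
    public_time        DATETIME,
    publish_time       DATETIME,
    sec_code           VARCHAR(16),
    sec_name           VARCHAR(64),
    announcement_title VARCHAR(512),
    adjunct_url        VARCHAR(512),
    pro_status         INT,
    retry_count        INT,
    create_time        DATETIME,
    update_time        DATETIME
)
"""
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='password', db='your_database')
cursor = conn.cursor()
cursor.execute(ddl)
conn.commit()
conn.close()

With the table in place, running python requests_to_mysql.py collects only today's announcements, while python requests_to_mysql.py all clears date and walks through every entry in date_list.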