常用项目工具集合
- sqlalchemy 使用on_duplicate_key_update
- Flask接口模块
- 日志模块
- 数据库连接模块
- sqlalchemy ORM创建模块
- scrapy items
- scrapy ProxyMiddleWare 代理中间件
- scrapy 请求头中间件
- scrapy pipeline
- 滑块登录验证
- 装饰器
- python将数据存入csv
- python将数据存入excel (.xlsx)
- python使用字符串调用类方法
- 通过字符串调用同级文件类方法
- 格式化公司名
- redis 分布式锁
- 无界面浏览器 driver
- 转化日期
- 下划线转驼峰
- 13位时间戳转日期
- 格式化日期
- 电话匹配
- 字典列表分组
- 查询两个日期间隔的所有时间日期 返回列表
sqlalchemy 使用on_duplicate_key_update
# Columns to refresh when the unique key already exists (MySQL upsert).
fields = ["enterprise_id","branch_name","person_charge","establishment_date","business_status"]
# Rows to upsert; "-" is this project's placeholder for missing values.
cache_data_list = [{
    "enterprise_id":3220082718061682058,
    "branch_name":"南京合诺建设工程有限公司溧水分公司",
    "person_charge":"-",
    "establishment_date":"-",
    "business_status":"-",
},{
    "enterprise_id":3220082718061682058,
    "branch_name":"南京合诺建设工程有限公司六合分公司",
    "person_charge":"-",
    "establishment_date":"-",
    "business_status":"-",
}]
# NOTE(review): `insert` must be sqlalchemy.dialects.mysql.insert for
# on_duplicate_key_update to exist — confirm the import elsewhere in the file.
insert_stmt = insert(LNBranchOrganize).values(cache_data_list)
# Map each updatable column to the value from the attempted INSERT row.
dup = {k: getattr(insert_stmt.inserted, k) for k in fields}
on_duplicate_key_stmt = insert_stmt.on_duplicate_key_update(**dup)
with session_scope(LongNaNewOriginSession) as session:
    session.execute(on_duplicate_key_stmt)
on_duplicate_key_update根据fields 提供的字段进行更新数据,同时根据表结构中的唯一索引实现更新。如果fields 中有字段发生改变,则update_time会进行更新;如果fields 中的字段与数据库完全一致,则update_time不会更新。需要在fields 中添加"update_time"字段,这样无论字段是否变更,update_time都会更新。
Flask接口模块
# Flask application with HTTP basic auth.
flask_app = Flask(__name__)
# NOTE(review): credentials are left empty here — fill in before deployment.
flask_app.config['BASIC_AUTH_USERNAME'] = ''
flask_app.config['BASIC_AUTH_PASSWORD'] = ''
basic_auth = BasicAuth(flask_app)
@flask_app.route('/home', methods=['get'])
def home_page():
    """Render the home page; on any template error return a JSON failure payload."""
    try:
        return render_template('home_index.html')
    except Exception as e:
        # send_data serializes a {msg, code, data} dict to a JSON string
        return send_data(msg=e.args, code=ERROR_FAILED_FLAG)
# Build the standard JSON response body.
def send_data(msg, code, data=None):
    """Serialize the standard {msg, code, data} response envelope to a JSON string."""
    payload = {
        "msg": msg,
        "code": code,
        "data": data
    }
    # ensure_ascii=False keeps Chinese text readable in the output
    return json.dumps(payload, ensure_ascii=False)
# Entry point: HOST_URL / HOST_PORT are expected to come from project config.
if __name__ == '__main__':
    flask_app.run(host=HOST_URL, port=HOST_PORT)
flask接口文件是比较常见的工具,用于提供大量可调用的接口,以方便完成后期开发工作的使用。
日志模块
from logging.config import dictConfig
from logging.handlers import *
from os.path import dirname
# Project root is two levels above this file; forward slashes for portability.
# NOTE(review): `os` is used here but only `from os.path import dirname` is
# visible above — confirm `import os` appears earlier in the file.
PROJECT_ROOT = dirname(dirname(os.path.abspath(__file__))).replace('\\', '/')
LOG_PATH = os.path.join(PROJECT_ROOT, 'logs', '项目名.log')
# Split the directory part off the log file path
file_dir = os.path.split(LOG_PATH)[0]
# Create the (possibly multi-level) log directory if it does not exist yet
if not os.path.isdir(file_dir):
    os.makedirs(file_dir)
# dictConfig-style logging configuration: DEBUG to console, WARNING+ to file.
LOG_CONFIG = {
    'version': 1,
    'formatters': {
        'standard': {
            # NOTE(review): the trailing "\ " is inside the string literal, so a
            # literal backslash ends up in every log line — confirm this is intended.
            'format': '%(asctime)s [%(threadName)s:%(thread)d] [%(name)s:%(lineno)d] \ '
                      ' [%(levelname)s]- %(message)s'
        },
        # other formatters
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'DEBUG',
            'formatter': 'standard'
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': LOG_PATH,
            'level': 'WARNING',
            'formatter': 'standard'
        },
        # other handlers
    },
    'loggers': {
        'spider': {
            # attaches both the console handler and the file handler
            'handlers': ['console', 'file'],
            'level': 'DEBUG',
        },
        # other loggers
    }
}
dictConfig(LOG_CONFIG)
def setup_logger(name, log_file=None, level=logging.INFO, propagate=False):
    """Create (or fetch) a named logger with a console and a rotating file handler.

    :param name: logger name; also used as the default log file stem
    :param log_file: file name under <PROJECT_ROOT>/logs; defaults to "<name>.log"
    :param level: minimum level for the file handler (console is always DEBUG)
    :param propagate: whether records bubble up to ancestor loggers
    :return: the configured logging.Logger
    """
    target_file = log_file if log_file else name + '.log'
    fmt = logging.Formatter(
        '%(asctime)s [%(threadName)s:%(thread)d] [%(name)s:%(lineno)d] \ '
        ' [%(levelname)s]- %(message)s')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(fmt)
    # 20 MB per file, keep 5 rotated backups
    rotating = RotatingFileHandler(
        os.path.join(PROJECT_ROOT, 'logs', target_file),
        maxBytes=20 * 1024 * 1024, backupCount=5, encoding='utf-8')
    rotating.setLevel(level)
    rotating.setFormatter(fmt)
    logger = logging.getLogger(name)
    # attach handlers only once so repeated calls do not duplicate output
    if not logger.handlers:
        logger.addHandler(console)
        logger.addHandler(rotating)
    logger.setLevel(logging.DEBUG)
    logger.propagate = propagate
    return logger
# Example: file handler records ERROR and above; console still shows DEBUG.
extract_origin_logger = setup_logger('extract_origin_logger', level=logging.ERROR)
自定义日志模块,完成日志的输出,
可自定义输出的内容及格式:
formatter = logging.Formatter('%(asctime)s [%(threadName)s:%(thread)d] [%(name)s:%(lineno)d] \ '
' [%(levelname)s]- %(message)s')
保存文件的大小及版本数量:maxBytes=20 * 1024 * 1024,backupCount=5
数据库连接模块
# Connection settings for the cleaned-data MySQL database.
NEW_ORIGIN_DATA_CONF = {
    "url": 'mysql+pymysql://root:123456@127.0.0.1:3306/new_origin_data',
    "max_overflow": 10,
    "pool_size": 200,
    "pool_timeout": 60,
    "pool_recycle": 60 * 60 * 2
}
# Example: engine for the cleaned-data database
engine = create_engine(
    NEW_ORIGIN_DATA_CONF['url'],
    max_overflow=NEW_ORIGIN_DATA_CONF['max_overflow'],  # extra connections allowed beyond pool_size
    pool_size=NEW_ORIGIN_DATA_CONF['pool_size'],  # connection pool size
    pool_timeout=NEW_ORIGIN_DATA_CONF['pool_timeout'],  # max seconds to wait for a free connection before raising
    pool_recycle=NEW_ORIGIN_DATA_CONF['pool_recycle']  # seconds after which pooled connections are recycled (reset)
)
NewOriginDataSession = sessionmaker(bind=engine)
# Thread-local session registry for multi-threaded use
local_conn_session = scoped_session(NewOriginDataSession)
# redis connection
# Service configuration; the DB index is filled in via .format().
# Fixed: the host contained a stray leading "r" ("r127.0.0.1").
REDIS_URI = "redis://:123456@127.0.0.1:6379/{}"
# Connection to the DB that stores proxies (DB index 5).
# Fixed: from_url is a classmethod — calling it on a throwaway StrictRedis()
# instance needlessly created one extra, unused default connection pool.
PROXY_REDIS_CONN = StrictRedis.from_url(REDIS_URI.format(5))
MYSQL连接及REDIS连接
sqlalchemy ORM创建模块
import time
from sqlalchemy import Column, String, Integer, Index, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.mysql import MEDIUMTEXT
from data import cache_origin_data_engine
# Declarative base shared by the ORM models below.
base_db = declarative_base()
# NOTE(review): lowercase aliases, presumably for copy-pasted DDL defaults —
# confirm they are actually referenced somewhere.
false = False
set_default = ''
class ProvinceData(base_db):
    """
    Province name data for companies.
    """
    __tablename__ = "r_company_province_data"
    # auto-increment primary key
    id = Column(Integer, primary_key=True, autoincrement=True)
    # company name
    company_name = Column(String(124), nullable=False, index=True)
    # province code; 9999 = unknown
    area_code = Column(Integer, nullable=False, default=9999)
    # other tags; 9999 = unknown
    other_tags = Column(Integer, nullable=False, default=9999)
    # creation timestamp (unix seconds, stored as string).
    # Fixed: the default must be a callable — `int(time.time())` was evaluated
    # once at import time, stamping every inserted row with module-load time.
    create_time = Column(String(24), nullable=False, default=lambda: int(time.time()))
    # last-update timestamp; onupdate refreshes it on every UPDATE
    update_time = Column(String(24), nullable=False, default=lambda: int(time.time()),
                         onupdate=lambda: int(time.time()))
    __table_args__ = (
        Index("i_province", "area_code", "company_name"),  # composite index
    )
根据前期设计表结构,使用orm创建数据库表。
scrapy items
import scrapy
class MultiDataItem(scrapy.Item):
    """
    Shared item for tables with an identical schema.

    Fixed: an item class must subclass scrapy.Item — the original subclassed
    scrapy.Field, which made the declared fields unusable as item keys.
    """
    # class name of the destination table
    table_name = scrapy.Field()
    # unified social credit code
    unified_social_credit_code = scrapy.Field()
    # company name
    company_name = scrapy.Field()
    # resource data URI
    data_uri = scrapy.Field()
    # update timestamp
    update_time_stamp = scrapy.Field()
    # raw data
    origin_data = scrapy.Field()
    # crawler script that produced the row
    crawl_script = scrapy.Field()
    # whether the row has been processed
    do_flag = scrapy.Field()
    # other tags
    other_tags = scrapy.Field()
    # spare data slot
    data_json = scrapy.Field()
对应数据表结构创建scrapy的item,便于数据存储。
scrapy ProxyMiddleWare 代理中间件
class ProxyMiddleWare(object):
    """
    Downloader middleware that routes every request through a proxy.
    """

    def process_request(self, request, spider):
        """Attach a proxy to every outgoing request."""
        request.meta["proxy"] = proxy_tool.get_proxy(reboot_name=spider.name)['http']

    def process_response(self, request, response, spider):
        """Retry 4xx responses with a fresh proxy; pass everything else through."""
        if not str(response.status).startswith('4'):
            return response
        # 4xx: swap in a new proxy and reschedule the same request
        request.meta['proxy'] = proxy_tool.get_proxy(reboot_name=spider.name)['http']
        return request
scrapy 请求头中间件
class RandomUserAgentMiddleware(object):
    """
    Downloader middleware that assigns a random User-Agent to each request.
    """

    def __init__(self, user_agent):
        # dict with "pc_user_agent" / "mobile_user_agent" lists (MY_USER_AGENT setting)
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        return cls(user_agent=crawler.settings.get('MY_USER_AGENT'))

    def process_request(self, request, spider):
        """Randomize the PC User-Agent; leave mobile UAs untouched."""
        # NOTE(review): reads the module-level MY_USER_AGENT here but
        # self.user_agent below — presumably the same dict; confirm.
        if request.headers['User-Agent'].decode('utf8') in MY_USER_AGENT['mobile_user_agent']:
            return
        request.headers['User-Agent'] = random.choice(self.user_agent["pc_user_agent"])

    def bulid_proxys(self, pxy):
        """
        Build the requests-style proxy dict for an http:// proxy URL.

        Fixed: the old `pxy.replace('p', 'ps')` rewrote EVERY 'p' in the URL
        (corrupting hosts like "proxy.example.com"); only the scheme should change.
        """
        return {'http': pxy, 'https': pxy.replace('http://', 'https://', 1)}
scrapy pipeline
class IntermediaryAgentPipeline(object):
    """Persist IntermediaryAgentTtem rows into the IntermediaryAgentData table."""

    def process_item(self, item, spider):
        if isinstance(item, IntermediaryAgentTtem):
            try:
                now_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                # Fixed: a scrapy Item stores its fields in item._values, so
                # item.__dict__ yielded {'_values': {...}} instead of the columns;
                # dict(item) returns the actual field/value mapping.
                json_item = dict(item)
                # strip CRLFs and spaces from every string value
                for key, value in json_item.items():
                    if isinstance(value, str):
                        json_item[key] = value.replace("\r\n", "").replace(" ", "")
                with session_scope(NewOriginDataSession) as session:
                    json_item['create_time'] = now_date
                    json_item['update_time'] = now_date
                    session.execute(IntermediaryAgentData.__table__.insert(), [json_item])
            except Exception as e:
                spider.log_reboot.error(f'error file:{e.__traceback__.tb_frame.f_globals["__file__"]}'
                                        f' , error line:{e.__traceback__.tb_lineno}, error message:{e.args}')
        # Fixed: pipelines must return the item so later pipelines still see it.
        return item
滑块登录验证
def login_check(self,company_pwd):
    """
    Log in with the given credentials and verify whether the password is valid.

    :param company_pwd: dict with "unified_social" (account) and "pwd" (password)
    :return: SUCCEED_POST when login succeeds, ERROR_POST otherwise
             (implicitly None if the outer exception handler is hit)
    """
    driver = start_chrome()
    try:
        unified_social = company_pwd.get("unified_social")
        pwd = company_pwd.get("pwd")
        driver.get('http://sfrz.xzzwfw.gov.cn:8081/sfrz/ids/netUserLogin/oauth2.do')
        time.sleep(0.1)
        # switch to the "legal person login" tab
        legal_login = driver.find_element(By.CLASS_NAME, "li02")
        time.sleep(0.1)
        legal_login.click()
        input_username = driver.find_element(By.ID, "frusername")
        input_username.send_keys(unified_social)
        time.sleep(0.1)
        input_pwd = driver.find_element(By.ID, "frpassword")
        input_pwd.send_keys(pwd)
        time.sleep(0.1)
        # drag the slider captcha before submitting
        slider = driver.find_element(By.ID, "btn02")
        result = self.move_to_gap(slider, driver)
        if result == ERROR_POST:
            return ERROR_POST
        driver.find_element_by_link_text("登录").click()
        driver.implicitly_wait(3)
        time.sleep(1)
        try:
            # The error banner exists only when login failed; when it is absent,
            # find_element raises and we fall through to the success path below.
            result = driver.find_element_by_xpath('//*[@id="errorDiv1"]')
            if result.text == '用户名、密码是否正确?请重试。':
                return ERROR_POST
            else:
                # todo other banner texts still to be classified
                print(result.text)
                print(pwd)
                return ERROR_POST
        except Exception as e:
            check_pwd.error(f'error file:{e.__traceback__.tb_frame.f_globals["__file__"]} , '
                            f'error line:{e.__traceback__.tb_lineno}, error message:{e.args}')
        check_pwd.info('密码验证成功!')
        return SUCCEED_POST
    except Exception as e:
        check_pwd.error(f'error file:{e.__traceback__.tb_frame.f_globals["__file__"]} , '
                        f'error line:{e.__traceback__.tb_lineno}, error message:{e.args}')
        driver.close()  # close the browser window to free resources
    finally:
        driver.quit()  # always quit the browser to free resources
def move_to_gap(self, slider, driver):
    """
    Drag the slider captcha all the way to the right and check the result.

    :param slider: the slider WebElement to drag
    :param driver: the active webdriver
    :return: SUCCEED_POST when the captcha accepts the drag, else ERROR_POST
    """
    # the track is a fixed 330px wide on this page
    move_length = 330
    ActionChains(driver).click_and_hold(slider).perform()
    ActionChains(driver).move_by_offset(xoffset=int(move_length), yoffset=0).perform()
    time.sleep(0.5)
    ActionChains(driver).release().perform()
    time.sleep(0.5)
    # after success the slider button sits inside the filled 330px track
    result = re.findall(r'<div id="slide_xbox02" style="width: 330px;">(.*?)<div id="btn02">', driver.page_source)
    if result:
        check_pwd.info("验证通过")
        return SUCCEED_POST
    else:
        return ERROR_POST
装饰器
def gevent_start():
    """
    Decorator factory: run the decorated function in a background thread.

    Fixed: the wrapper used to return [Thread(...).start()], i.e. [None],
    because Thread.start() returns None — callers could never join or inspect
    the thread. It now returns the started Thread object in the list.
    """
    import functools

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                path_data = args[0]
                print(f'--gevent start fun: {path_data} start--')
                worker = Thread(target=func, args=(path_data,), name='check_name')
                worker.start()
                return [worker]
            except Exception as e:
                decorators_logger.error(f'error file:{e.__traceback__.tb_frame.f_globals["__file__"]}'
                                        f' , error line:{e.__traceback__.tb_lineno}, error message:{e.args}')
        return wrapper
    return decorator
python将数据存入csv
def load_company_certificate():
    """
    Export certificate data from the Crawl table to one CSV per province.

    For each whitelisted province code, stream the matching origin_data rows
    and write the selected fields into ./temp_files/<area_code>.csv.
    """
    session_crawl = etl_business_session()
    try:
        my_list_of_area_code = [320000, 330000, 350000, 370000, 510000, 440000]
        for area_code in my_list_of_area_code:
            # yield_per streams rows in batches of 1000 instead of loading all at once
            origin_data_list = session_crawl.query(CacheQualificationCompanyAssociation.origin_data).filter_by(
                other_tags=area_code).yield_per(1000)
            with open(f"./temp_files/{area_code}.csv", "w+", encoding='utf-8', newline="") as f:
                csv_writer = csv.writer(f)
                csv_writer.writerow(["xx", "xx", "xx", "xx", "xx", "xx", "xx"])
                for origin_data in origin_data_list:
                    message = json.loads(origin_data[0]).get('message')
                    csv_writer.writerow([message.get('companyName'), message.get('certificateNo'),
                                         message.get('majorName'), message.get('tradeName'),
                                         message.get('levelName'), message.get('approveDate'),
                                         message.get('approveOrgan')])
            mysql_dump.info(f"{area_code}数据转入完成")
        # commit only on the success path
        session_crawl.commit()
    except Exception as e:
        # Fixed: the old code rolled back and then unconditionally committed in
        # `finally`, re-committing right after the rollback; it also swallowed
        # the error without a trace.
        session_crawl.rollback()
        mysql_dump.error(f"load_company_certificate failed: {e.args}")
    finally:
        session_crawl.close()
python将数据存入excel (.xlsx)
def pd_to_excel():
    """
    Export key-province certificate data to one .xlsx file per province.
    """
    session_crawl = etl_business_session()
    # fields pulled out of each row's "message" object, in column order
    export_fields = ["companyName", "certificateNo", "majorName", "tradeName",
                     "levelName", "approveDate", "approveOrgan"]
    my_list_of_area_code = [320000, 330000, 350000, 370000, 510000, 440000]
    for area_code in my_list_of_area_code:
        origin_data_list = session_crawl.query(CacheQualificationCompanyAssociation.origin_data).filter_by(
            other_tags=area_code).yield_per(1000)
        # One column list per exported field.
        # Fixed: the old dict literal used the placeholder key "xx" seven times;
        # duplicate keys collapse, so only the last column survived.
        columns = {field: [] for field in export_fields}
        for origin_data in origin_data_list:
            message = json.loads(origin_data[0]).get('message')
            for field in export_fields:
                columns[field].append(message.get(field))
        df = pd.DataFrame(columns)
        # Fixed: ExcelWriter.save() was removed in pandas 2.0; the context
        # manager saves and closes the file on exit.
        with pd.ExcelWriter(f'./temp_files/{area_code}.xlsx') as writer:
            df.to_excel(writer, index=False)
        mysql_dump.info(f"{area_code}数据转入完成")
python使用字符串调用类方法
def monitor_factory(monitor):
    """
    Monitor-class factory.

    Scan utils/site_data_monitor.py for classes and return the one whose
    `spider` attribute matches the given script name.

    :param monitor: name of the spider script to be monitored
    :return: the matching monitor class, or None if nothing matches
    """
    module = importlib.import_module("utils.site_data_monitor")
    for class_name, class_obj in inspect.getmembers(module, inspect.isclass):
        # Fixed: getmembers already yields the class object — the old code
        # re-resolved it with eval(class_name), which is redundant and breaks
        # for classes not imported into this module's own namespace.
        spider_name = getattr(class_obj, 'spider', None)
        if spider_name is not None and monitor == spider_name:
            return class_obj
“utils.site_data_monitor” 指 utils 目录下的 site_data_monitor.py。该方法获取 site_data_monitor.py 下的所有类名称,再使用 eval 将其转换为类对象。
通过字符串调用同级文件类方法
# Call a method on this object by its string name (here: the spider's own name).
getattr(self, self.spider_name)()
getattr 函数用于返回一个对象属性值。
jieba分词
# JIEBA_DICT is a custom user dictionary for word segmentation
jieba.load_userdict(JIEBA_DICT)
# count high-frequency words
words = jieba.lcut(company_name_str)
result = Counter(words)
格式化公司名
def format_filed_data(filed_data):
    """
    Normalize a company-name-like field.

    Half-width parentheses become full-width, and all spaces are removed —
    except for purely alphabetic (English) names, where inner spaces are kept
    and only the ends are stripped.

    :param filed_data: the raw field value (docstring fixed: it documented a
                       non-existent "company_name" parameter)
    :return: the normalized string
    """
    # normalize parentheses to full-width
    format_filed = filed_data.replace("(", "(").replace(")", ")")
    # drop every space for the CJK case
    base_format_filed = format_filed.replace(" ", "")
    # bytes.isalpha() is ASCII-only, so any CJK character makes this False
    is_pure_english = base_format_filed.encode('utf-8').isalpha()
    return format_filed.strip() if is_pure_english else base_format_filed
redis 分布式锁
def acquire_lock_with_timeout(lock_name: str, acquire_timeout: int = 3, lock_timeout: int = 2):
    """
    Distributed lock built on Redis.

    :param lock_name: name of the lock
    :param acquire_timeout: seconds to keep trying to acquire the lock (default 3)
    :param lock_timeout: seconds before the lock expires on its own (default 2)
    :return: the lock holder's identifier on success, False on timeout
    """
    identifier = str(uuid.uuid4())
    lockname = f'lock:{lock_name}'
    lock_timeout = int(math.ceil(lock_timeout))
    end = time.time() + acquire_timeout
    while time.time() < end:
        # Fixed: SET NX EX sets the key and its expiry atomically. The old
        # setnx + expire pair could crash between the two calls and leave a
        # lock with no TTL — a deadlock until manual cleanup.
        if LOCK_REDIS_CONN.set(lockname, identifier, ex=lock_timeout, nx=True):
            return identifier
        # repair a lock that somehow exists without an expiry
        elif LOCK_REDIS_CONN.ttl(lockname) == -1:
            LOCK_REDIS_CONN.expire(lockname, lock_timeout)
        time.sleep(0.001)
    return False
def release_lock(lockname, identifier):
    """
    Release a lock previously acquired with acquire_lock_with_timeout.

    :param lockname: name of the lock (without the "lock:" prefix)
    :param identifier: identifier returned when the lock was acquired
    :return: True if this holder's lock was deleted, False otherwise
    """
    # redis-py exposes transactions through pipeline()
    with LOCK_REDIS_CONN.pipeline() as pipe:
        lockname = 'lock:' + lockname
        while True:
            try:
                # WATCH the lock key; if another client changes it after MULTI,
                # EXEC raises WatchError and we retry the whole check
                pipe.watch(lockname)
                iden = pipe.get(lockname)
                if iden and iden.decode('utf-8') == identifier:
                    # transaction starts here
                    pipe.multi()
                    pipe.delete(lockname)
                    pipe.execute()
                    return True
                # not our lock (or already gone): stop without deleting
                pipe.unwatch()
                break
            except WatchError:
                # the key changed under us — retry
                pass
    return False
无界面浏览器 driver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver import Chrome, ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def start_chrome():
    """Build a headless Chrome webdriver with crawling-friendly options."""
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')  # avoids the "DevToolsActivePort file doesn't exist" error
    chrome_options.add_argument('window-size=1920x3000')  # fix the browser resolution
    chrome_options.add_argument('--disable-gpu')  # Google docs suggest this to work around a bug
    chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars for special pages
    chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up loading
    chrome_options.add_argument('--headless')  # required on Linux hosts without a display
    chrome_driver = f"{CHILD_PATH_ROOT}/utils/temp_files/chromedriver.exe"  # pinned chromedriver (87.0.4280.88)
    # NOTE(review): chrome_options=/executable_path= is the selenium 3 spelling;
    # selenium 4 expects options= plus a Service object — confirm the pinned version.
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver)
    return driver
转化日期
# current date as a compact YYYYMMDD string
now_date = datetime.datetime.now().strftime('%Y%m%d')
下划线转驼峰
def str2Hump(self, text):
    """
    Convert an underscore-separated string to camelCase.

    :param text: the snake_case key
    :return: the camelCase equivalent ("" for empty/underscore-only input)
    """
    words = [w for w in text.lower().split('_') if w]
    if not words:
        return ""
    # first word stays lowercase; every later word gets its head capitalized
    tail = "".join(w[0].upper() + w[1:] for w in words[1:])
    return words[0] + tail
13位时间戳转日期
def time_stamp_to_date(self, time_stamp):
"""
13位时间戳转日期
:return:
"""
if len(str(time_stamp)) == 13 or len(str(time_stamp)) == 10:
if self.format_date(str(time_stamp)):
return time_stamp
change_date = str(time_stamp)[:10]
time_array = time.localtime(int(change_date))
other_style_time = time.strftime("%Y-%m-%d", time_array)
return other_style_time
else:
return '-'
格式化日期
def format_date(self, date):
    """
    Normalize assorted date spellings to YYYY-MM-DD.

    Accepts "2020年3月5日", "2020/3/5" and "2020-3-5" styles; month and day
    are zero-padded. Returns None for empty input and "" when no date is found.
    """
    cleaned = date.replace("年", "-").replace("月", "-").replace("日", "").replace("/", "-") if date else ""
    matched = "".join(re.findall(r'\d{4}-\d{1,}-\d{1,}', cleaned)) if cleaned else None
    if not matched:
        return matched
    parts = matched.split("-")
    # zfill(2) pads one-digit months/days and leaves longer values untouched
    parts[1] = parts[1].zfill(2)
    parts[2] = parts[2].zfill(2)
    return "-".join(parts)
电话匹配
def filtration_phone(self, phones):
    """
    Extract the first valid phone number from a raw value.

    :param phones: raw phone field (any type; coerced to str)
    :return: (number, 2) for a landline, (number, 1) for a mobile,
             (False, False) when nothing matches or an error occurs
    """
    try:
        phones = str(phones)
        # landline: 4-7 or 3-8 digit area-code/number split
        fix_phone = re.findall(r'\d{4}-\d{7}|\d{3}-\d{8}', phones)
        if fix_phone:
            # (old code wrapped this in "".join(...), a no-op on a str)
            return fix_phone[0], 2
        # mobile: 11 digits starting with 1
        move_phone = re.findall('[1][0-9]{10}', phones)
        if move_phone:
            return move_phone[0], 1
        return False, False
    except Exception as e:
        etl_start_logger.error(f'error file:{e.__traceback__.tb_frame.f_globals["__file__"]} , '
                               f'error line:{e.__traceback__.tb_lineno}, error message:{e.args}')
        # Fixed: the old handler fell off the end and returned None, which
        # crashed callers that unpack two values.
        return False, False
字典列表分组
data_list 为包含字典数据的列表
下面代码的意思是,将数据,根据ent_id进行分组,保存到不同的新列表,并按分组发送到kafka
# Group data_list (a list of dicts) by ent_id and send each complete group to
# Kafka. groupby only merges adjacent rows, so the list MUST be sorted by the
# same key first.
data_list.sort(key=itemgetter('ent_id'))
for data, items in groupby(data_list, key=itemgetter('ent_id')):
    try:
        group_by_data = []
        total = 0
        for one in items:
            # Fixed: the condition was inverted ("if not one.get('total')"),
            # which always left `total` falsy and never captured the real
            # expected group size.
            total = one.get("total") if one.get("total") else 0
            group_by_data.append(one)
        # only ship the group once all expected rows have arrived
        if len(group_by_data) >= total:
            bson_data = format_data(group_by_data, config_info)
            KAFKAPRODUCER.send("dwd_crawler_data", bson_data)
    except Exception as e:
        logger.error(f"kafka数据发送报错:{e}")
        continue
查询两个日期间隔的所有时间日期 返回列表
def getEveryDay(begin_date, end_date):
    """Return every date from begin_date to end_date (both inclusive) as YYYY-MM-DD strings."""
    cursor = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
    last = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    one_day = datetime.timedelta(days=1)
    days = []
    while cursor <= last:
        days.append(cursor.strftime("%Y-%m-%d"))
        cursor += one_day
    return days