1. 计时器
def runtime(func):
    """Decorator: print how long each call of *func* takes (wall clock)."""
    # Local imports keep the snippet self-contained; neither module is
    # imported elsewhere in this article's code.
    import functools
    import time

    # functools.wraps preserves func.__name__/__doc__ on the wrapper —
    # the original lost them, even though the print below uses __name__.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic high-resolution clock, immune to
        # system-clock changes (unlike time.time()).
        start_time = time.perf_counter()
        res = func(*args, **kwargs)
        cost_time = time.perf_counter() - start_time
        print(f'the function{func.__name__}\'s run time is{cost_time}')
        return res
    return wrapper
作用:使用装饰器查看函数的运行时间。
这里有一个小细节,关于time模块下的三个函数计时:time.time()、time.perf_counter()以及time.process_time()计时的区别,time()是使用用户时间计时,你在start到end这段时间更改自己电脑的系统时间,最后的cost time就会随之改变。perf counter是截取的标准时间,不会随着用户时间改变而改变。process time是进程时间,使用它会忽略time.sleep()经过的时间,有兴趣可以自己测试一下。
2. Ture
def fetch_sth():
    """Patch module globals: define the common typo ``Ture`` as True and
    route ``print`` through the logging machinery for easy debug cleanup."""
    import logging  # local import: the snippet has no visible import block
    global Ture, print
    Ture = True
    # BUG FIX: logging.DEBUG is the int level constant (10), not a callable;
    # assigning it made every subsequent print(...) raise TypeError.
    # logging.debug is the function that actually emits the message.
    print = logging.debug
作用:类似c语言中的#define ture true
习惯性打Ture,然后看着ide冒出小红曲线,再把它改成True。加个补丁后先跑一下,就再也不用看ide脸色行事了,甚至它还会告诉你哪个才是“真正的真”。print变debug不用多说,对我这种不用print找不到bug的渣渣来说绝对实用,找完bug直接打补丁,免得一行一行去注释print。
3. dns缓存
import socket
# from gevent import socket
# Process-wide DNS answer cache, keyed by the getaddrinfo arguments.
_dnscache = {}

def setDNSCache():
    """Monkey-patch ``socket.getaddrinfo`` with a memoizing wrapper.

    Saves the real function as ``socket._getaddrinfo`` (once) so repeated
    calls to setDNSCache never lose the original. Intended for crawlers
    that resolve the same hosts over and over.
    """
    def _getaddrinfo(*args, **kwargs):
        # BUG FIX: the original keyed the cache on positional args only,
        # so calls differing only in keyword arguments shared one entry.
        key = (args, tuple(sorted(kwargs.items())))
        try:
            return _dnscache[key]
        except KeyError:
            res = socket._getaddrinfo(*args, **kwargs)
            _dnscache[key] = res
            return res

    # Stash the real implementation only once so we never overwrite it
    # with an already-patched version.
    if not hasattr(socket, '_getaddrinfo'):
        socket._getaddrinfo = socket.getaddrinfo
    socket.getaddrinfo = _getaddrinfo
不用框架的爬虫必备,原理前面文章说过,不再多说。用法嘛,import socket后直接setDNSCache()就完事儿了。
4. 请求头格式化。
def get_headers(chrome_headers):
    """Turn request headers copied from Chrome devtools into a dict.

    chrome_headers : the raw multi-line "Key: value" text.
    Returns {key: value}; values keep any ':' they contain (e.g. URLs).
    Lines without a colon map the whole line to '' (original behavior).
    """
    headers = {}
    # splitlines() also handles \r\n from Windows copy-paste, which the
    # original split('\n') left as trailing \r on every value.
    for line in chrome_headers.splitlines():
        if not line.strip():
            continue
        # Split on the FIRST colon only; the rest of the line is the value.
        key, _sep, value = line.partition(':')
        headers[key.strip()] = value.strip()
    return headers
函数主要是把复制粘贴的谷歌浏览器请求头变成字典形式
5. 发邮件
import smtplib
from email import encoders
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
from email.mime.base import MIMEBase
# SMTP account settings; replace with your own mailbox (SMTP must be enabled).
config = {
    "user": "xxxxxxxx@outlook.com",
    "password": "xxxxxxxxx",
    "mail_host": "smtp.office365.com",
    "mail_port": "587"
}

def send_warning(title, content):
    """Send a plain-text warning mail over SMTP with STARTTLS.

    title   : subject line
    content : plain-text body
    Credentials and server location come from the module-level ``config``.
    """
    mail_host = config['mail_host']
    # BUG FIX: keep the port numeric for the socket layer.
    mail_port = int(config['mail_port'])
    mail_user = config['user']
    mail_pass = config['password']
    sender = mail_user
    receivers = 'xxxxxxxxx@xxxx.com'
    message = MIMEMultipart()
    message['From'] = Header("python email sender", 'utf-8')
    # BUG FIX: the original never set a To header; many servers flag or
    # reject mails without one.
    message['To'] = receivers
    message['Subject'] = Header(title, 'utf-8')
    message.attach(MIMEText(content, 'plain', 'utf-8'))
    try:
        # BUG FIX: SMTP(host=...) already connects (on port 25!) and the
        # original then called connect() a second time. Connect exactly once,
        # and use the context manager so QUIT is always issued (the original
        # leaked the connection).
        with smtplib.SMTP(mail_host, mail_port) as smtpObj:
            print('服务器连接成功')
            smtpObj.set_debuglevel(True)
            smtpObj.ehlo()
            smtpObj.starttls()
            smtpObj.ehlo()  # re-identify after the TLS handshake, per RFC 3207
            smtpObj.login(mail_user, mail_pass)
            print('登陆成功')
            smtpObj.sendmail(sender, receivers, message.as_string())
            print("邮件发送成功")
    except smtplib.SMTPException as e:
        print(e)
        print("Error: 无法发送邮件")

if __name__ == '__main__':
    send_warning('Test email', 'haha I\'m a superman!!!')
把config中的user和password改成自己的邮箱账号密码(需设置smtp开放权限),mail_host和mail_port改成对应邮箱的smtp host和port(outlook直接用我上面写的,其它自行百度),然后就可以通过send_warning来催自己改bug了。
6. bloom filter
from redis import client
class BloomFilter(object):
    """Client for the RedisBloom ("rebloom") module on a Redis server.

    Constructor arguments are forwarded verbatim to ``redis.client.Redis``
    (host, port, ...). The target server must run the redislabs/rebloom
    module (e.g. the docker image, port 6379). Always call :meth:`close`
    when you are done with the instance.
    """

    def __init__(self, *args, **kwargs):
        # Plain redis-py connection; the BF.* commands below are issued
        # through execute_command.
        self._cli = client.Redis(*args, **kwargs)

    def list(self) -> list[str]:
        """Return every key name on the server (filters are stored as keys)."""
        # BUG FIX: send command and argument separately — relying on redis-py
        # to split the single string 'keys *' on whitespace is fragile.
        res = self._cli.execute_command('KEYS', '*')
        return [i.decode('utf8') for i in res]

    def set(self, bf_name: str, error_rate: float = 0.0001, max_size: int = 1000000) -> bool:
        """Create a new bloom filter; its parameters are fixed afterwards.

        bf_name    : filter (key) name
        error_rate : accepted false-positive rate
        max_size   : capacity (exceeding it raises the false-positive rate)
        Raises ValueError if a filter with that name already exists.
        """
        try:
            # BUG FIX: pass each argument separately instead of one f-string,
            # so names/values containing whitespace cannot corrupt the command.
            return self._cli.execute_command('BF.RESERVE', bf_name, error_rate, max_size)
        except Exception as exc:  # narrowed from a bare except; keep the ValueError contract
            raise ValueError(f'The filter "{bf_name}" is existed, please try other name') from exc

    def add(self, bf_name: str, value: str) -> bool:
        """Add *value* to filter *bf_name*; True if it was not present before."""
        res = self._cli.execute_command('BF.ADD', bf_name, value)
        return bool(res)

    def exists(self, bf_name: str, value: str) -> bool:
        """Return whether *value* is (probably) in filter *bf_name*."""
        res = self._cli.execute_command('BF.EXISTS', bf_name, value)
        return bool(res)

    def close(self):
        """Release the underlying Redis connection."""
        self._cli.close()
if __name__ == '__main__':
    # Smoke test against a rebloom-enabled Redis host: an unseen value
    # should be absent before insertion and present afterwards.
    demo = BloomFilter(host='xxx.xxx.xxx.xx')
    demo.set('test')
    before = demo.exists('test', 'www.youku.com')   # expected: False
    demo.add('test', 'www.youku.com')
    after = demo.exists('test', 'www.youku.com')    # expected: True
    print(f'res1:{before} \n res2:{after}')
    print('bf list:', demo.list())
    demo.close()
scrapy版的布隆过滤器以前说过,这里来一个通用的框架,使用方法:
1.服务器运行docker run -d -p 6379:6379 redislabs/rebloom (下载并运行搭建好的布隆过滤器容器)
2.正常导入包即可,host为docker宿主机的公网ip,即服务器ip
另外结合这个布隆过滤器可以对requests进行扩展,使用补丁使requests的get方法可以直接去重。代码参考:
def fetch_url(bf_name: str = 'test55', max_size: int = 10):
    """Patch ``requests`` so GET requests are deduplicated via a bloom filter.

    bf_name  : filter name (project name; created automatically if missing)
    max_size : filter capacity (exceeding it raises the false-positive rate)

    After calling this, ``requests.get`` (and sessions created from
    ``requests.Session``) raise for URLs already recorded in the filter.
    """
    _bf = BloomFilter(host='134.175.122.20')
    try:
        _bf.set(bf_name, max_size=max_size)
    except ValueError:
        pass  # filter already exists — reuse it

    def _get(url, *args, **kwargs):
        # Refuse already-seen URLs; otherwise fetch and record the URL.
        if _bf.exists(bf_name, url):
            raise Exception(f'{url}在过滤器中已存在')
        res = requests.real_get(url, *args, **kwargs)
        _bf.add(bf_name, url)
        return res

    class _session(requests.Session):
        def __init__(self):
            super().__init__()
            self.real_get = self.get
            self.get = self._get

        def _get(self, url, *args, **kwargs):
            if _bf.exists(bf_name, url):
                raise Exception(f'{url}在过滤器中已存在')
            res = self.real_get(url, *args, **kwargs)
            _bf.add(bf_name, url)
            return res

    # BUG FIX: the original defined the wrappers but never installed them —
    # requests was left unpatched and requests.real_get did not exist, so
    # _get would have crashed even if wired up manually. Save the real get
    # only once so repeated calls don't lose it.
    if not hasattr(requests, 'real_get'):
        requests.real_get = requests.get
    requests.get = _get
    requests.Session = _session
使用如下:
7. 多线程
def run_fast(func, theard_num: int = 200, work_total: int = 500000, buffers: int = 0, *args, **kwargs):
    """Submit *func* ``work_total`` times to a thread pool and wait.

    func       : callable run for every task (receives *args / **kwargs)
    theard_num : number of worker threads (name kept for backward compat)
    work_total : how many times to submit func
    buffers    : 0 -> wait for ALL_COMPLETED, 1 -> FIRST_COMPLETED,
                 anything else -> FIRST_EXCEPTION
    Returns the ``(done, not_done)`` future sets from ``futures.wait``.

    BUG FIX: the original annotated func as ``function`` — an undefined
    name — which raised NameError as soon as the module was imported.
    """
    executor = futures.ThreadPoolExecutor(max_workers=theard_num)
    future_tasks = [executor.submit(func, *args, **kwargs) for _ in range(work_total)]
    # Dispatch table instead of the if/elif chain.
    return_when = {0: futures.ALL_COMPLETED, 1: futures.FIRST_COMPLETED}.get(
        buffers, futures.FIRST_EXCEPTION)
    done, not_done = futures.wait(future_tasks, return_when=return_when)
    # Don't block here (preserves early return for FIRST_COMPLETED), but
    # release the pool so its threads exit once the queue drains — the
    # original never shut the executor down.
    executor.shutdown(wait=False)
    return done, not_done
我这种懒人日常用这个= =多进程同理
8. 9. 10. 略
其实自己还把pymysql之类日常要用的超过三句话的代码都简单封装成了一个函数,不过没必要都放出来,主要是根据个人业务需求来修改。主要是提供个思路吧,就酱紫。
END!