Python小知识点整理

你很棒滴

已于 2022-10-19 16:54:17 修改

阅读量734

点赞数

文章标签： python 爬虫 node.js

于 2021-05-25 09:25:37 首次发布

本文链接：https://blog.csdn.net/RayMand168/article/details/117248418

版权

字符串转数字

# Parse a comma-separated string of integers into a list of ints.
# The text is bound to ``num_str`` instead of ``str`` so the built-in ``str``
# type is not shadowed for every statement that runs afterwards.
num_str = '1,2,3,4'
int_list = list(map(int, num_str.split(',')))
# output: [1, 2, 3, 4]

retry函数的应用

import time
from retry import retry

'''记录初始时刻'''
start_time = time.time()


@retry(delay=1, tries=10, backoff=2, max_delay=20)
def demo(start_time):
    """Print the whole seconds elapsed since ``start_time``, then fail.

    The function always raises so that the ``retry`` decorator keeps calling
    it again; the printed timings make the pause between attempts visible.

    Args:
        start_time: Reference timestamp as returned by ``time.time()``.

    Raises:
        RuntimeError: Always, to trigger the next retry.
    """
    print(round(time.time() - start_time, 0))
    # A bare ``raise`` with no active exception only fails by accident
    # ("RuntimeError: No active exception to reraise"); raise the same
    # exception type explicitly, with a message that states the intent.
    raise RuntimeError('intentional failure to trigger a retry')

demo(start_time)

为程序添加进度条

from tqdm import tqdm
import time

# Wrapping the iterable in tqdm() renders a progress bar as the loop advances.
for i in tqdm(range(10)):
    time.sleep(0.5)

参考链接:
https://www.cnblogs.com/feffery/p/13392024.html

faker 函数

# -*- coding: utf-8 -*-
# @Author   : FELIX
# @Date     : 2018/6/30 9:49

from faker import Factory

# 'zh_CN' selects the mainland-China locale for the generated data.
# ``create`` is called on the class itself, matching the other snippets in
# this file, instead of on a throwaway ``Factory()`` instance.
fake = Factory.create('zh_CN')
# Each call below asks the faker instance ``fake`` for one random value and prints it.
# Random phone number
print(fake.phone_number())
# Random person name
print(fake.name())
# Random address
print(fake.address())
# Random country name
print(fake.country())
# Random country code
print(fake.country_code())
# Random city name
print(fake.city_name())
# Random city
print(fake.city())
# Random province
print(fake.province())
# Random email address
print(fake.email())
# Random IPv4 address
print(fake.ipv4())
# Random string whose length lies between min_chars and max_chars
print(fake.pystr(min_chars=0, max_chars=8))

# Random license plate
print(fake.license_plate())

# Random colors
print(fake.rgb_color())  # rgb
print(fake.safe_hex_color())  # hexadecimal
print(fake.color_name())  # color name
print(fake.hex_color()) # hexadecimal

# Random company name
print(fake.company())


# Random job title
print(fake.job())
# Random password
print(fake.password(length=10, special_chars=True, digits=True, upper_case=True, lower_case=True))
# Random uuid
print(fake.uuid4())
# Random sha1
print(fake.sha1(raw_output=False))
# Random md5
print(fake.md5(raw_output=False))

# Random female name
print(fake.name_female())
# Random male name
print(fake.name_male())
# Random name
print(fake.name())

# Basic personal profile
print(fake.profile(fields=None, sex=None))
print(fake.simple_profile(sex=None))

# Random browser user_agent header
print(fake.user_agent())

# Random dates and times.
# The comment under each call shows one sample return value; nothing is
# printed here because the results are not passed to print().
fake.month_name()
# 'September'
fake.date_time_this_century(before_now=True, after_now=False, tzinfo=None)
# datetime.datetime(2010, 7, 21, 18, 52, 43)
fake.time_object(end_datetime=None)
# datetime.time(6, 39, 26)
fake.date_time_between(start_date="-30y", end_date="now", tzinfo=None)
# datetime.datetime(2013, 10, 11, 18, 43, 40)
fake.future_date(end_date="+30d", tzinfo=None)
# datetime.date(2018, 7, 8)
fake.date_time(tzinfo=None, end_datetime=None)
# datetime.datetime(2006, 9, 4, 20, 46, 6)
fake.date(pattern="%Y-%m-%d", end_datetime=None)
# '1998-08-02'
fake.date_time_this_month(before_now=True, after_now=False, tzinfo=None)
# datetime.datetime(2018, 6, 8, 9, 56, 24)
fake.timezone()
# 'Africa/Conakry'
fake.date_time_this_decade(before_now=True, after_now=False, tzinfo=None)
# datetime.datetime(2017, 6, 27, 21, 18, 28)
fake.month()
# '04'
fake.day_of_week()
# 'Wednesday'
fake.iso8601(tzinfo=None, end_datetime=None)
# '1988-02-28T09:22:29'
fake.time_delta(end_datetime=None)
# datetime.timedelta(10832, 82660)
fake.date_object(end_datetime=None)
# datetime.date(2005, 8, 18)
fake.date_this_decade(before_today=True, after_today=False)
# datetime.date(2015, 1, 5)
fake.date_this_century(before_today=True, after_today=False)
# datetime.date(2000, 6, 1)
fake.date_this_month(before_today=True, after_today=False)
# datetime.date(2018, 6, 13)
fake.am_pm()
# 'AM'
fake.past_datetime(start_date="-30d", tzinfo=None)
# datetime.datetime(2018, 6, 25, 7, 41, 34)
fake.date_this_year(before_today=True, after_today=False)
# datetime.date(2018, 2, 24)
fake.date_time_between_dates(datetime_start=None, datetime_end=None, tzinfo=None)
# datetime.datetime(2018, 6, 26, 14, 40, 5)
fake.date_time_ad(tzinfo=None, end_datetime=None)
# datetime.datetime(673, 1, 28, 18, 17, 55)
fake.date_between_dates(date_start=None, date_end=None)
# datetime.date(2018, 6, 26)
fake.future_datetime(end_date="+30d", tzinfo=None)
# datetime.datetime(2018, 7, 4, 10, 53, 6)
fake.century()
# 'IX'
fake.past_date(start_date="-30d", tzinfo=None)
# datetime.date(2018, 5, 30)
fake.time(pattern="%H:%M:%S", end_datetime=None)
# '01:32:14'
fake.day_of_month()
# '19'
fake.unix_time(end_datetime=None, start_datetime=None)
# 1284297794
fake.date_time_this_year(before_now=True, after_now=False, tzinfo=None)
# datetime.datetime(2018, 5, 24, 11, 25, 25)
fake.date_between(start_date="-30y", end_date="today")
# datetime.date(2003, 1, 11)
fake.year()
# '1993'
fake.time_series(start_date="-30d", end_date="now", precision=None, distrib=None, tzinfo=None)
# <generator object time_series at 0x7f44e702a620>


# Random file-related values (sample return value shown under each call)
fake.file_extension(category=None)
# 'xls'
fake.file_name(category=None, extension=None)
# '表示.csv'
fake.file_path(depth=1, category=None, extension=None)
# '/教育/客户.js'
fake.unix_device(prefix=None)
# '/dev/sdf'
fake.unix_partition(prefix=None)
# '/dev/vdf0'
fake.mime_type(category=None)
# 'multipart/form-data'
# 'multipart/form-data'

用itertools排列

import itertools
name = 'Python'
# Print every ordering of the characters of ``name``, one tuple per line.
print(*itertools.permutations(name), sep='\n')
# out（以下为 name = 'Pyth' 时的输出；name = 'Python' 时共有 720 个 6 元组）:
('P', 'y', 't', 'h')
('P', 'y', 'h', 't')
('P', 't', 'y', 'h')
('P', 't', 'h', 'y')
('P', 'h', 'y', 't')
('P', 'h', 't', 'y')
('y', 'P', 't', 'h')
('y', 'P', 'h', 't')
('y', 't', 'P', 'h')
('y', 't', 'h', 'P')
('y', 'h', 'P', 't')
('y', 'h', 't', 'P')
('t', 'P', 'y', 'h')
('t', 'P', 'h', 'y')
('t', 'y', 'P', 'h')
('t', 'y', 'h', 'P')
('t', 'h', 'P', 'y')
('t', 'h', 'y', 'P')
('h', 'P', 'y', 't')
('h', 'P', 't', 'y')
('h', 'y', 'P', 't')
('h', 'y', 't', 'P')
('h', 't', 'P', 'y')
('h', 't', 'y', 'P')

一串数字与数字列表的转换

# Split an integer into a list of its decimal digits by going through its
# string form; two equivalent spellings are shown.
num = 123456

# using map: int() is applied to each character of str(num)
list_of_digits = list(map(int, str(num)))
print(list_of_digits)

# [1, 2, 3, 4, 5, 6]

# using list comprehension: same conversion, written as a comprehension
list_of_digits = [int(x) for x in str(num)]
print(list_of_digits)

# [1, 2, 3, 4, 5, 6]

列表取样

import secrets 
# imports secure module.

# Population to draw from, and how many items to pick.
my_list = ['a', 'b', 'c', 'd', 'e']
num_samples = 2

# SystemRandom is the random generator exposed by the ``secrets`` module.
secure_random = secrets.SystemRandom()
samples = secure_random.sample(my_list, k=num_samples)
print(samples)
# e.g. ['e', 'd'] -- any 2 distinct elements of my_list

列表清单扁平化

from iteration_utilities import deepflatten
# if you only have one depth nested_list, use this

def flatten(l):
    """Flatten a list that is nested exactly one level deep.

    Args:
        l: An iterable of iterables, e.g. ``[[1, 2], [3]]``.

    Returns:
        A new list holding every item of every sub-iterable, in order.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

l = [[1,2,3],[3]]
print(flatten(l))
# [1, 2, 3, 3]

# if you don't know how deep the list is nested
l = [[1,2,3],[4,[5],[6,7]],[8,[9,[10]]]]
print(list(deepflatten(l, depth=3)))
# [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# 若有正确格式化的数组，Numpy扁平化是更佳选择。

两字典的合并

# Merge two dicts into a new one with ** unpacking. For a key present in both
# ('banana'), the value from the dict unpacked last (dict_2) wins.
dict_1 = {'apple': 9, 'banana': 6}

dict_2 = {'banana': 4, 'orange': 8}

combined_dict = {**dict_1, **dict_2}
# The ** prefix is required: it unpacks each dict's key/value pairs into the new literal.
print(combined_dict)

# Output

# {'apple': 9, 'banana': 4, 'orange': 8}

python配置(configparser)文件的使用

import os
from configparser import ConfigParser


class Read_Config:
    """Thin wrapper around ``ConfigParser`` for reading an INI settings file.

    Args:
        file_path: Path of the INI file to load. When omitted (or empty), the
            file ``config.ini`` in the same directory as this module is used.
    """

    def __init__(self, file_path=None):
        if file_path:
            config_path = file_path
        else:
            # Resolve the default next to this source file rather than the
            # current working directory, so the lookup does not depend on
            # where the script is launched from.
            module_dir = os.path.dirname(os.path.abspath(__file__))
            config_path = os.path.join(module_dir, 'config.ini')
        self.cf = ConfigParser()
        # NOTE: ConfigParser.read() silently skips files that do not exist; a
        # missing file only shows up later as NoSectionError from get().
        self.cf.read(config_path, encoding='utf-8')

    def get_mysql_info(self, param):
        """Return option ``param`` from the ``[mysql_database]`` section."""
        value = self.cf.get('mysql_database', param)
        return value

    def get_cookie(self, param):
        """Return option ``param`` from the ``[requests_set]`` section."""
        cookies = self.cf.get('requests_set', param)
        return cookies

    def get_redis_info(self, param):
        """Return option ``param`` from the ``[redis_database]`` section."""
        value = self.cf.get('redis_database', param)
        return value


# config.ini 文件格式
[redis_database]
host = xxxxxxxx
pwd = 123456
db = 3

# 调用
r = Read_Config()
self.redis_conn = redis.Redis(host=r.get_redis_info('host'), port=6379, db=r.get_redis_info('db'), encoding='utf-8',password=r.get_redis_info('pwd'))

python脚本执行完自动关机

# NOTE(review): -s / -f / -t 60 look like the Windows ``shutdown`` switches
# (shut down, force-close applications, 60-second delay) -- confirm for the target OS.
os.system('shutdown -s -f -t 60')
# Add this as the very last statement of the program.

jsonpath（提取json数据）

jsonpath常用语法
根节点($)
当前节点(@)
子节点(.或[])
任意节点(*)
任意后代节点(..)
条件筛选(?(@.键名 比较符(==,<,>) 值))

在这里插入图片描述

import json
from jsonpath import jsonpath

# Load the sample JSON document from disk.
with open('json示例.json', encoding='utf-8') as j:
    demo_json = json.load(j)

# Pull values out of the document with a JSONPath expression.
jsonpath(demo_json, '$..steps[*].duration')

输出结果

参考链接：
https://mp.weixin.qq.com/s/k5Dw4lZOAqGDsm5BZSGSjw

eval函数处理字符串转字典，列表

import ast

# SECURITY: eval() executes arbitrary code, so it must never be given text
# from an untrusted source. ast.literal_eval() parses the same list/dict
# text but only accepts plain Python literals.
parsed_list = ast.literal_eval("[1,2,3]")
# output [1, 2, 3] (a list)
parsed_dict = ast.literal_eval("{'1':'22'}")
# output {'1': '22'} (a dict)

参考链接：
https://www.cnblogs.com/who-care/p/9306800.html

简单验证码识别

import ddddocr

ocr = ddddocr.DdddOcr()

with open('./企业微信截图_20210825103615.png','rb') as f:
    img_byte = f.read()
res = ocr.classification(img=img_byte)

print(res)

cookie的获取方式

# 1. Log in with selenium, then reuse the browser's cookies in requests.
# NOTE(review): ``driver`` (a logged-in selenium WebDriver) and ``url`` are
# expected to be defined by the surrounding code -- confirm before reuse.
cookie_list = driver.get_cookies()  # one dict per cookie
# requests wants a flat {name: value} mapping.
cookie = {item['name']: item['value'] for item in cookie_list}
html = requests.get(url, cookies=cookie)

# 2. Read the cookies set on a response obtained through a requests session.
s = requests.session().get(url)
cookies = s.cookies

# 3.browser_cookie3库
import browser_cookie3
import requests
from requests.utils import dict_from_cookiejar

# url = 'https://twitter.com/home'
url = 'https://www.csdn.net/'
cj = browser_cookie3.chrome() # firefox可以替换为browser_cookie3.firefox()
cookie_dict = dict_from_cookiejar(cj)
# print(cookie_dict)

r = requests.get(url, cookies=cj)
# print(r.status_code)
print(r.text)

参考连接：
https://zhuanlan.zhihu.com/p/54202593
https://zhuanlan.zhihu.com/p/64366444
https://blog.csdn.net/qq_34022601/article/details/88700943

js逆向—PyExecJS

'''
1.打开登录界面，搜索(ctrl+shift+f)加密后的关键词，打断点、
2.重跑登录界面，看断点位置,找到相关函数。
3.复制到发条js调试工具上，进行调试。(缺啥补啥)，调试后的代码，复制到一个js文件
4.执行execjs库，进行读取转码(1，需要安装nodejs和pyexecjs）
'''

import execjs

# Get the JavaScript runtime that PyExecJS selects by default.
node = execjs.get()
# Compile the JS source; the context manager closes the file once it is read.
with open('../Universal/JS_PyExecJS/wx.js', encoding='utf-8') as js_file:
    ctx = node.compile(js_file.read())
password = 'pwd'
# Name of the function defined in the JS file.
function_name = 'getPwd'
# call() passes ``password`` as a real argument. The earlier hand-built string
# 'getPwd(pwd' had an unbalanced parenthesis and an unquoted argument.
pwd = ctx.call(function_name, password)
# Print the encrypted result so it can be compared with the site's value.
print(pwd)

python写入csv 用Excel打开乱码的解决方法

from pandas import read_csv

# Load the UTF-8 CSV and save the same table as an .xlsx workbook.
source_csv = './all_data_update.csv'
target_xlsx = './all_data_update.xlsx'
csv_to_excel = read_csv(source_csv, encoding='utf-8')
csv_to_excel.to_excel(target_xlsx, sheet_name='all_data_update')

测试代理的有效性

import random

import requests
from faker import Factory

# Candidate proxies for plain-http targets.
proxies_list = [
    {'http': 'http://183.166.164.105:54227'},
    {'http': 'http://183.165.242.177:54220'},
]
headers = {
    'user-agent': Factory.create().user_agent()
}
# Proxy for https targets: the key is the scheme of the *target* URL.
proxies = {
    "https": "http://183.166.164.105:54227",
    # "https": "http://10.10.1.10:1080",
}
# Seconds to wait before giving up, so a dead proxy cannot hang the script.
REQUEST_TIMEOUT = 10

print(random.choice(proxies_list))
# https target through the fixed https proxy
a = requests.get('https://icanhazip.com', headers=headers, proxies=proxies,
                 timeout=REQUEST_TIMEOUT)
print(a.text)

# http target through a randomly chosen http proxy
b = requests.get('http://icanhazip.com', headers=headers,
                 proxies=random.choice(proxies_list), timeout=REQUEST_TIMEOUT)
print(b.text)

# http target without any proxy, for comparison
p = requests.get('http://icanhazip.com', headers=headers,
                 timeout=REQUEST_TIMEOUT)
print(p.text)

# 固定格式
{'http':'http://ip:port'}
{'https':'http://ip:port'}
# 注意改变的只是字典的键！！！

官方文档：
https://docs.python-requests.org/zh_CN/latest/user/advanced.html#proxies
其他参考：
https://blog.csdn.net/weixin_43343144/article/details/107210942?utm_medium=distribute.pc_relevant.none-task-blog-2_defaultbaidujs_title~default-0.control&spm=1001.2101.3001.4242

装饰器的使用

def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func``, prints the elapsed
        seconds after a successful call, and returns ``func``'s result. The
        wrapper keeps ``func``'s ``__name__`` and docstring.
    """
    import functools
    import time

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic, so clock adjustments cannot skew it.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        pass_time = time.perf_counter() - start_time
        # Report the decorated function's own name; reading the name from
        # ``wrapper`` always printed "wrapper".
        print(f'执行{func.__name__}程序用了{pass_time}秒')
        return result
    return wrapper


@timer
def text():
    """Print a demo line and return a demo string."""
    print('我就是试试功能')
    a = 'i just try something'
    return a


print(text())
# text() returns a value, so call it inside print() to see that value.

参考链接：
https://www.jb51.net/article/158814.htm

csv文件的读写

import csv

# Write one plain row. newline='' lets the csv module control line endings
# itself, so no blank line appears between rows.
with open('text.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(('title', 'name', 'age'))  # any tuple or list

# Write dict rows: the header names decide the column order.
headers = ['title', 'name', 'age']
with open('text.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, headers)
    writer.writeheader()
    writer.writerow({'title': 'Python', 'name': 'ray', 'age': 18})

# Read the file back; every row comes out as a list of strings.
with open('text.csv', 'r', encoding='utf-8', newline='') as file:
    rows = list(csv.reader(file))
for line in rows:
    print(line)

参考链接：
https://blog.csdn.net/katyusha1/article/details/81606175

随机user_agent

from faker import Factory
us = Factory.create().user_agent()
# 第一种

from fake_useragent import UserAgent
ua = UserAgent().chrome
# 第二种

Python enumerate() 函数

enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列，同时列出数据和数据下标，一般用在 for 循环当中。

Python 2.3. 以上版本可用，2.6 添加 start 参数。

seasons = ['Spring', 'Summer', 'Fall', 'Winter']
list(enumerate(seasons))
# 结果
[(0, 'Spring'), (1, 'Summer'), (2, 'Fall'), (3, 'Winter')]
list(enumerate(seasons, start=1)) 
# 结果
[(1, 'Spring'), (2, 'Summer'), (3, 'Fall'), (4, 'Winter')]

与for循环的结合：
seq = ['one', 'two', 'three']
# enumerate() yields (index, item) pairs, so no manual counter is needed.
for i, element in enumerate(seq):
    print(i, element)  # print is a function in Python 3
# Result:
# 0 one
# 1 two
# 2 three

正则表达式去除字符串中的特殊符号

# The 9 characters that may not appear in a Windows file name. Raw strings
# keep the backslash literal; the unprefixed forms relied on the invalid
# escapes '\/' and '\巴', which newer Python versions warn about.
s = r'*\/:?"<>|'
str1 = r'\巴拉<1"!11【】>1*hgn/p:?|'  # sample input

1.提取感兴趣（想要）的字符
# Keep only runs of Chinese characters, ASCII letters and digits.
wanted_runs = re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+', str1, re.S)
a = "".join(wanted_runs)
print(a)
结果如下：
巴拉1111hgnp

2.只去除不想要的，比如只去除不能作为文件名的字符
# Keep every character except the ones forbidden in a file name.
allowed_chars = re.findall(r'[^\*"/:?\\|<>]', str1, re.S)
a = "".join(allowed_chars)
print(a)
结果如下：　　
巴拉1!11【】1hgnp

参考链接：
https://www.cnblogs.com/JIM-FAN/p/13122217.html

把一个程序和所用的库打包到一个新电脑使用

简单打包命令

pyinstaller [-F/-D] [-w/-c] [-i xxx.ico] xxx.py/xxx.spec

xxx.py/xxx.spec：需要打包的程序main文件或者spec文件。spec文件在使用py文件进行打包时会在相同路径下自动生成，spec中的内容也是根据命令行中输入的内容来生成的，也可以使用命令pyi-makespec [options] xxx.py来生成一个纯粹的spec文件，而不会去执行打包的操作。

-F/--onefile：将整个程序打包为一个exe文件，需要注意的是，与-D模式生成的exe程序相比，在启动速度上会慢一点，原因是它需要先解压exe文件并生成一个唯一的临时环境来运行程序，关闭环境时也会自动删除这个临时环境，-D模式的程序本身就是解压好的，运行完也不需要执行删除操作，当程序比较大时，这个差别就很明显了。

-D/--onedir：默认选项，与F/--onefile参数作用相反，将程序打包为一个文件夹，文件夹中包含启动程序的exe文件和其他依赖的资源文件和DLL文件等。

-w：表示程序运行后隐藏命令行窗口，当你不需要使用命令行窗口作为程序的I/O时，比如GUI程序，可以使用这个参数选项。

-c：默认选项，与-w相反，提供一个命令行窗口进行I/O。

-i/--icon：指定exe程序图标。

参考链接：
https://blog.csdn.net/weixin_39531229/article/details/110537890

python制作词云图

from wordcloud import WordCloud,ImageColorGenerator
import matplotlib.pyplot as plt
import jieba

path_text = './text.txt'
# Read the text to visualise; the context manager closes the file, which the
# one-line open(...).read() form never did.
with open(path_text, mode='r', encoding='utf-8') as text_file:
    f = text_file.read()
# Segment the text with jieba and join the tokens with spaces.
cut_text = " ".join(jieba.cut(f))
# Load the picture with matplotlib.pyplot.
backgroup_img = plt.imread('./apic33734.jpg')
# 用matplotlib.pyplot 读取图片

wordcloud = WordCloud(
   # Set a font, otherwise the characters render as empty boxes; this is the
   # usual Windows fonts folder and any other font file can be used instead.
    font_path="C:/Windows/Fonts/simfang.ttf",
   # Background color, then width and height
    background_color="white",
   # background_color=None,
    mode='RGBA',
    width=1000,
    height=880,
    mask= backgroup_img
)
# The arguments above set the font, background, mode, width, height and mask.
wordcloud = wordcloud.generate(cut_text)
wordcloud.recolor(color_func=ImageColorGenerator(backgroup_img))
# generate() builds the cloud first, then recolor() repaints it with colors taken from the image.
wordcloud.to_file('图云.png')

参考连接：
https://blog.csdn.net/qq_39611230/article/details/105954600

命令行运行python.py

python 程序.py的储存路径
# 虚拟机运行py文件时，确定python的版本python3.8 程序.py,
# 若不能直接拉取文件路径，需要进入到py文件所在路径
# 虚拟机不能运行run.py，需要用scrapy runspider xx.py 进行调用

检验proxy是否可用

import requests

url = 'https://www.baidu.com/'
# url = 'http://www.chinaz.com'

# requests picks a proxy by the lower-case scheme of the target URL, so an
# upper-case 'HTTPS' key is never matched and the request goes out directly,
# making every proxy look "valid". The proxy address itself uses the http://
# scheme, matching the {'https': 'http://ip:port'} format used earlier in
# this file.
proxies = {'https': 'http://113.243.34.63:4243'}

# A timeout keeps a dead proxy from hanging the check indefinitely.
html = requests.get(url, proxies=proxies, timeout=10)
html.encoding = 'utf-8'

print(html.status_code)
# print(html.text)
# print(html.headers)
# http客户端请求  http://www.chinaz.com
# https客户端请求 https://www.baidu.com

requests中处理cookie

import requests

def login():
    """Post the login form and return the cookies the server sets.

    Returns:
        dict | None: Cookie names mapped to values, or ``None`` when the
        request itself fails (the error is printed).
    """
    login_url = 'http://www.xxx.com/login'
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01"
    }
    body = {
        "usercode": "liuzz05@****.com",
        "password": "123456"
    }
    try:
        res = requests.post(url=login_url, headers=headers, data=body)
    except requests.RequestException as err:
        # Only request failures are handled here; programming errors are
        # left to propagate instead of being hidden.
        print('获取cookie失败：\n{0}'.format(err))
        return None
    # Convert the CookieJar into a plain dict.
    return requests.utils.dict_from_cookiejar(res.cookies)

# 引用
import requests

def get_data():
    """Request ``get_data_url`` with the cookies from ``login()`` and print the body."""
    res = requests.get(url=get_data_url, cookies=login())
    print(res.text)

URL的汉字转码问题

编码

from urllib.parse import quote

keyword = '中国'
# Percent-encode the keyword so it can be placed in a URL.
encoded_keyword = quote(keyword)
print(encoded_keyword)
# Output: %E4%B8%AD%E5%9B%BD

解码

from urllib.parse import unquote

# Decode the percent-encoded text back to the original characters.
decoded_text = unquote("%E4%B8%AD%E5%9B%BD")
print(decoded_text)
# Output: 中国

html是\uxxx\uxxx转汉字

html = requests.get(url=url, headers=headers)
# SECURITY: never eval() a response body -- the server controls that text and
# eval() would run whatever Python it contains. Decode the \uXXXX escapes
# with the 'unicode_escape' codec instead. Encoding with 'backslashreplace'
# first turns characters that are already non-ASCII into escapes as well, so
# they survive the round trip unchanged.
content = html.text.encode('ascii', 'backslashreplace').decode('unicode_escape')
print(content)
# The escapes are now readable characters (e.g. Chinese text).

参考网址：
https://blog.csdn.net/qq_29499107/article/details/83051660

xpath提取同一节点下多段文字，并去空格

normalize-space

item['title'] = html.xpath('normalize-space(string(//div/a/text()))')

xpath用文字定位提取href

contains

href = html.xpath('//a[@id="text" and contains(text(),"哈哈哈") and not (contains(text(),"666"))]/@href')

xpath 进阶用法

子标签/.. 表示父标签；标签/following-sibling::* 表示其后的同级标签（/following-sibling::*[1] 为紧邻的下一个同级标签）

参考链接：
https://www.cnblogs.com/feffery/p/10996526.html

js2py函数的用法

js2py.eval_js(js_string)
直接运行含有js代码的字符串（或js文件），并得出结果

js_string='var a=10'

js2py.eval_js(js_string) 
#输出10

js2py.EvalJs():
生成一个EvalJs对象
可通过该对象的execute方法来运行一段js代码（或js文件），并得到对应的变量和对象（即抑制输出，得到变量和对象，便于后续直接使用）
可通过该对象的eval()方法来运行一段js代码，并得到结果

import js2py
js_obj=js2py.EvalJs()
string='''
var a=10
function func(a,b){
    return a*b
}
'''
 
js_obj.execute(string)
js_obj.a #输出为10
js_obj.func #为func函数
js_obj.func(3,4) #输出为12

获取现在时间

import datetime

# Current local time, formatted as text.
now = datetime.datetime.now()
ts = now.strftime('%Y-%m-%d %H:%M:%S')

# Current time in GMT/UTC, formatted HTTP-date style.
# datetime.utcnow() is deprecated since Python 3.12; ask for an aware UTC
# datetime instead -- the formatted text is the same.
GMT_TIME = datetime.datetime.now(datetime.timezone.utc).strftime('%a, %d %b %Y %H:%M:%S GMT')

时间戳转化为现在时间

import time

# Convert a Unix timestamp into a formatted local-time string.
timestamp = round(time.time())
array_time = time.localtime(timestamp)
my_time = time.strftime('%Y-%m-%d %H:%M:%S', array_time)

# Parse a GMT-style date string into a datetime object.
dd = "Fri Nov 09 2018 14:41:35 GMT+0800 (CST)"
# 'GMT+0800 (CST)' is matched as literal text, so the result is a naive
# datetime carrying the wall-clock values from the string.
GMT_FORMAT = '%a %b %d %Y %H:%M:%S GMT+0800 (CST)'
# strptime is a method of the datetime *class*; with ``import datetime`` the
# call must be datetime.datetime.strptime (datetime.strptime raises
# AttributeError on the module).
parsed_time = datetime.datetime.strptime(dd, GMT_FORMAT)
print(parsed_time)

if '天前' in post_time:
	time_1 = time.strftime('%Y-%m-%d',time.localtime(time.time()-int(post_time.replace('天前',''))*24*60*60))
                
elif '小时前' in post_time:
	time_1 = time.strftime('%Y-%m-%d',time.localtime(time.time() - int(post_time.replace('小时前', ''))*60*60))
             
 # 转换时间格式

参考链接：
https://blog.csdn.net/qq_35462323/article/details/83994563

任务框架APScheduler

APScheduler是Python的一个定时任务框架，用于执行周期或者定时任务，
可以基于日期、时间间隔，及类似于Linux上的定时任务crontab类型的定时任务；
该框架不仅可以添加、删除定时任务，还可以将任务存储到数据库中，实现任务的持久化，使用起来非常方便。
安装方式：pip install apscheduler

apscheduler组件及简单说明：
1>triggers（触发器）：触发器包含调度逻辑，每一个作业有它自己的触发器
2>job stores（作业存储）:用来存储被调度的作业，默认的作业存储器是简单地把作业任务保存在内存中,支持存储到MongoDB，Redis数据库中
3> executors（执行器）：执行器用来执行定时任务，只是将需要执行的任务放在新的线程或者线程池中运行
4>schedulers（调度器）：调度器是将其它部分联系在一起,对使用者提供接口，进行任务添加，设置，删除。

import datetime
import time

from apscheduler.schedulers.blocking import BlockingScheduler


def func():
    """Print the current local time."""
    now = datetime.datetime.now()
    ts = now.strftime('%Y-%m-%d %H:%M:%S')
    print('do func  time :',ts)


def func2():
    """Print the current local time, then sleep for 2 seconds."""
    now = datetime.datetime.now()
    ts = now.strftime('%Y-%m-%d %H:%M:%S')
    print('do func2 time：',ts)
    time.sleep(2)


def dojob():
    """Register both jobs on a BlockingScheduler and start it."""
    # Create the scheduler: BlockingScheduler
    scheduler = BlockingScheduler()
    # Run func every 2 seconds.
    scheduler.add_job(func, 'interval', seconds=2, id='test_job1')
    # Run func2 every 3 seconds (the earlier comment said 5 s, which
    # contradicted the code).
    scheduler.add_job(func2, 'interval', seconds=3, id='test_job2')
    scheduler.start()


dojob()

输出结果中可以看到：任务就算是有延时，也不会影响其他任务执行。
APScheduler框架提供丰富接口去实现定时任务，可以去参考官方文档去查看使用方式。

参考链接：
https://juejin.cn/post/6844903872872333326
https://www.sohu.com/a/407444741_658944
实例链接：
https://blog.csdn.net/qq_21137441/article/details/112232995

Python类变量和实例变量（类属性和实例属性）

class Test:
    """Demo class contrasting class, instance and local variables."""

    # Class attribute: defined on the class, visible through every instance.
    name = 'ray'

    def __init__(self, arg):
        # Instance attribute: stored on this particular object.
        self.arg = arg

    def __say__(self):
        """Print the greeting ``a`` times."""
        # Local variable: exists only while this call runs.
        a = 1
        print('hello world' * a)

# 其中Test为类，Test()为类的实例对象，name为类属性，arg为实例属性，__say__为实例方法，a 为局部变量

参考链接
http://c.biancheng.net/view/2283.html

用grequests处理并发请求

import grequests
# Every URL needs an explicit scheme, otherwise requests rejects it.
url_list = [f'https://www.baidu.com?page={i}' for i in range(10)]
# Build the requests without sending them yet.
# NOTE(review): ``headers`` is expected to be defined by the surrounding code.
req_list = [grequests.get(url=url, headers=headers) for url in url_list]
# Send them concurrently; map() takes the list built above (the undefined
# name ``task`` was passed here before).
html_list = grequests.map(req_list)

for html in html_list:
    # A failed request shows up as None in the result list.
    if html is None:
        continue
    print(html.text)

注意：

1.应该先 import grequests，然后再 import requests，否则会出现猴子补丁（monkey patch）相关的错误！！
2. grequests.map()的gtimeout参数与requests自身的timeout无关，开启后可能会出现None响应

参考连接
https://www.cnblogs.com/superhin/p/11583560.html

scrapy 中自动按照响应域名自动拼接url

def parse(self,response):
	href = response.xpath('')
	yield scrapy.Request(url=response.urljoin(href))

spider和crawlspider的请求传参

def get_parse(self, response):
    """Show two ways of attaching extra data to a follow-up request.

    NOTE(review): ``url``, ``title``, ``newsUrl`` and ``newsTitle`` are
    expected to come from the surrounding spider code.
    """
    # Style 1: pass the data through the ``meta`` argument.
    yield scrapy.Request(url=url, callback=self.parse, meta={'title': title})
    # Style 2: build the request first, then fill in ``request.meta``.
    request = scrapy.Request(url=newsUrl, callback=self.get_article)
    request.meta['newsTitle'] = newsTitle
    yield request


def parse(self, response):
    """Read back the data attached to the request that produced ``response``.

    NOTE(review): ``item`` is expected to be created by the surrounding code.
    """
    # Receiving the value passed with style 1
    title = response.meta['title']
    # Receiving the value passed with style 2
    item['newsTitle'] = response.meta['newsTitle']

logging日志与定时器apschedule的结合

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.events import EVENT_JOB_EXECUTED,EVENT_JOB_ERROR
import logging
from time import sleep
import time


import os

# logging does not create missing directories, so make sure the log folder
# exists before the log file is opened.
os.makedirs('./Picture', exist_ok=True)

logging.basicConfig(level=logging.INFO,
                    # The format previously ended with a stray ')' that was
                    # appended to every log line.
                    format='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s',
                    # Date layout used for %(asctime)s in the format above
                    datefmt='%Y-%m-%d %H:%M:%S',
                    # Path and name of the log file
                    filename='./Picture/log1.txt',
                    # Append to the file instead of overwriting it
                    filemode='a',
                    )

def fun1():
    """Print ten numbered lines, pausing half a second after each one."""
    now = time.asctime()
    i = 0
    while i < 10:
        print(f'现在是{now}',f'我在正在打印第{i}次')
        sleep(0.5)
        i += 1

def fun2():
    """Print five numbered lines, then fail on purpose with a division by zero."""
    now = time.asctime()
    for i in range(5):
        print(f'现在是{now}',f'我是数字{i},我在成长')
    # Raises ZeroDivisionError before anything is printed.
    print('整除函数',1/0)

def my_listener(event):
    """Print whether the job behind ``event`` failed or finished normally.

    Args:
        event: Object with an ``exception`` attribute; a truthy value is
            reported as a failure.
    """
    message = '程序出错' if event.exception else '一切按计划形式'
    print(message)
	
def dojob():
    """Schedule fun1 and fun2 as interval jobs, attach the listener and start."""
    scheduler = BlockingScheduler()
    # (callable, job id, minutes, seconds) for each interval job
    job_specs = (
        (fun1, 'interval_fun1', 0.5, 1),
        (fun2, 'interval_fun2', 0.3, 2),
    )
    for job, job_id, every_minutes, every_seconds in job_specs:
        scheduler.add_job(job, trigger='interval', minutes=every_minutes,
                          seconds=every_seconds, id=job_id)
    # Call my_listener for jobs that executed and for jobs that raised.
    scheduler.add_listener(my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler._logger = logging
    scheduler.start()

dojob()

参考链接：
https://www.cnblogs.com/xianyulouie/p/11041777.html

(?P…) 正则的用法

# Compile the pattern once, outside the loop. (?P<num>...) names the group
# so it can be fetched by name.
# NOTE(review): the leading '.' in the group matches any single character
# before the digits -- confirm that is intended.
obj = re.compile(r'"productId":(?P<num>.\d+?),', re.S)
for html in html_list:
    # finditer() yields one match object per occurrence in the page text.
    for match in obj.finditer(html.text):
        # Fetch the captured text by group name. The loop variable is not
        # called ``id``, so the built-in id() stays usable.
        num = match.group('num')
        print(num)