log
# standard-library module, nothing to pip install
import logging
# Create a logger named after the current module
logger = logging.getLogger(__name__)
# Set the log level
logging.basicConfig(level=log_level)  # log_level: DEBUG INFO WARNING ERROR
# Log messages
logging.debug('debug message')
logging.info('info message')
logging.warning('warning message')
logging.error('Protocol problem: %s', 'connection reset')
# Log exceptions
try:
    ...
except Exception as e:
    logger.exception(e)
    raise
logging is the official Python module: https://docs.python.org/3/library/logging.html#logging.debug
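A minimal end-to-end sketch tying the pieces above together; the format string, function name and messages are illustrative, not from the original notes:

import logging

# Configure the root logger once, early in the program
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s %(message)s")

logger = logging.getLogger(__name__)

def divide(a, b):
    try:
        return a / b
    except ZeroDivisionError:
        # logger.exception logs at ERROR level and appends the traceback
        logger.exception("division failed: %s / %s", a, b)
        raise

logger.info("starting up")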
PostgreSQL
Basic usage
import psycopg2
# 1. Connect
try:
    pg_db = psycopg2.connect(database=database,
                             user=username,
                             password=password,
                             host=host,
                             port=port,
                             options="-c search_path=" + schema)  # search_path selects the schema
    pg_cursor = pg_db.cursor()
except Exception as e:
    pg_db.rollback()
    logger.exception(e)
    raise
finally:
    pg_cursor.close()
    pg_db.close()
# 2. Cursor object
cur = conn.cursor()
# 3. Create a table
cur.execute("CREATE TABLE test (id serial PRIMARY KEY, num integer, data varchar);")
# 4. Parameterized insert
cur.execute("INSERT INTO test (num, data) VALUES (%s, %s)",
            (100, "abc'def"))
# 5. Query
cur.execute("SELECT * FROM test;")
cur.fetchone()  # (1, 100, "abc'def")
cur.fetchall()  # [(1, 100, "abc'def"), (2, None, 'dada'), (3, 42, 'bar')]
# 6. Commit and close
conn.commit()
cur.close()
conn.close()
Official docs: https://www.psycopg.org/docs/usage.html
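As a variant of the flow above, psycopg2 connections and cursors can also be used as context managers; a minimal sketch, with placeholder connection parameters:

import psycopg2

# Placeholder connection parameters -- adjust for your environment
conn = psycopg2.connect(database="mydb", user="me", password="secret",
                        host="localhost", port=5432)
try:
    # Exiting the `with conn` block commits the transaction, or rolls it back
    # if an exception was raised; it does NOT close the connection.
    with conn:
        with conn.cursor() as cur:   # the cursor is closed on exit
            cur.execute("INSERT INTO test (num, data) VALUES (%s, %s)", (1, "hello"))
            cur.execute("SELECT count(*) FROM test;")
            print(cur.fetchone()[0])
finally:
    conn.close()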
Tips
# Batch insert
from psycopg2.extras import execute_batch
execute_batch(cur,
              "INSERT INTO test (id, v1, v2) VALUES (%s, %s, %s)",
              params_list)  # [[1,2,3],[1,2,3]] or [(1,2,3),(1,2,3)]
# Insert with named (dict) parameters
cur.execute("INSERT INTO test (num, data) VALUES (%(arg1)s, %(arg2)s)",
            {"arg1": 200, "arg2": "bcd"})
# Query with dict-like rows
import psycopg2.extras
dict_cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
dict_cur.execute("SELECT * FROM test")
# Result rows behave like dicts, so columns can be read by name
rec = dict_cur.fetchone()
rec['id']   # 1
rec['num']  # 100
# and also like tuples
rec[1]      # 100
Extras docs: https://www.psycopg.org/docs/extras.html
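psycopg2.extras also provides execute_values, which folds many rows into a single multi-VALUES statement; a small sketch, where the table and data are illustrative:

from psycopg2.extras import execute_values

rows = [(1, 'a'), (2, 'b'), (3, 'c')]
# The single %s placeholder is expanded into (v1, v2), (v1, v2), ... by execute_values
execute_values(cur,
               "INSERT INTO test (num, data) VALUES %s",
               rows,
               page_size=1000)
conn.commit()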
simplejson
Basic usage
import simplejson
# Write a JSON file
with open(path, 'w', encoding='UTF-8') as f:
    simplejson.dump(data_obj, f)
# Produce a JSON string
json_str = simplejson.dumps(data_obj)
Notes:
simplejson.dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
                allow_nan=True, cls=None, indent=None, separators=None,
                encoding='utf-8', default=None, use_decimal=True,
                namedtuple_as_object=True, tuple_as_array=True,
                bigint_as_string=False, sort_keys=False, item_sort_key=None,
                for_json=None, ignore_nan=False, int_as_string_bitcount=None,
                iterable_as_array=False, **kw)
Out of the box simplejson only serializes the basic types (str, int, long, float, bool, None) and the standard containers;
use_decimal defaults to True, so decimal.Decimal values are serialized natively;
tuple_as_array defaults to True, so tuples (and subclasses) are encoded as JSON arrays;
other types can be handled by passing a custom encoder class via the cls parameter, for example:
# simplejson.JSONEncoder is the default encoder class
from datetime import datetime, timedelta

class JsonFileTypeEncoder(simplejson.JSONEncoder):
    def default(self, o):  # called for objects the encoder cannot serialize
        if isinstance(o, tuple):
            return o[0].strftime("%Y-%m-%d %H:%M:%S")
        elif isinstance(o, datetime):
            return o.strftime("%Y-%m-%d")
        elif isinstance(o, timedelta):
            return str(o).split(".")[0]
        else:
            # fall back to the default encoder for anything else
            return simplejson.JSONEncoder.default(self, o)
# Usage
simplejson.dumps(data_obj, cls=JsonFileTypeEncoder)
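For reading JSON back, simplejson.load / simplejson.loads mirror dump / dumps; a small sketch (the file name and input string are illustrative) showing that loads can parse numbers as decimal.Decimal when use_decimal=True:

import simplejson

# Read back from a file
with open('data.json', 'r', encoding='UTF-8') as f:
    data_obj = simplejson.load(f)

# Parse a string; use_decimal=True turns JSON floats into decimal.Decimal
obj = simplejson.loads('{"price": 12.30}', use_decimal=True)
print(obj['price'], type(obj['price']))  # 12.30 <class 'decimal.Decimal'>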
Official docs: https://simplejson.readthedocs.io/en/latest/
requests
Official docs: https://docs.python-requests.org/zh_CN/latest/user/quickstart.html
Reference: https://www.liaoxuefeng.com/wiki/1016959663602400/1183249464292448#0
pip install requests
import requests
r = requests.get('https://www.baidu.com/')  # Baidu homepage
r.status_code  # 200
r.text  # '<!DOCTYPE html>\n<html>\n<head>\n<meta ...'  (the page HTML as text)
# Response attributes
r.url       # the URL that was actually requested
r.encoding  # response encoding
# Whether the response is text or binary, the content attribute gives the raw bytes:
r.content
# b'<!DOCTYPE html>\n<html>\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n...'
# For specific response types such as JSON, the parsed result is available directly:
r = requests.get('https://query.yahooapis.com/v1/public/yql?q=select')
r.json() # {'query': {'count': 1, 'created': '2017-11-17T07:14:12Z', ...
# Query string parameters
r = requests.get('https://www.douban.com/search', params={'q': 'python', 'cat': '1001'})
r.url # 'https://www.douban.com/search?q=python&cat=1001'
# To send HTTP headers, pass a dict as the headers parameter:
r = requests.get('https://www.douban.com/', headers={'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit'})
r.text
# '<!DOCTYPE html>\n<html>\n<head>\n<meta charset="UTF-8">\n <title>豆瓣(手机版)</title>...'
# POST form data
r = requests.post('https://accounts.douban.com/login', data={'form_email': 'abc@example.com', 'form_password': '123456'})
# requests encodes POST data as application/x-www-form-urlencoded by default. To send JSON, pass the json parameter:
params = {'key': 'value'}
r = requests.post(url, json=params)  # serialized to JSON internally
# Similarly, file uploads need a more complex encoding, which requests simplifies to the files parameter:
upload_files = {'file': open('report.xls', 'rb')}
r = requests.post(url, files=upload_files)
# Open the file in binary mode ('rb') so the byte length matches the file size.
# Response headers
r.headers  # {'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Content-Encoding': 'gzip', ...}
# Request headers (the headers that were actually sent)
r.request.headers  # {'User-Agent': 'python-requests/2.x', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', ...}
# requests handles cookies specially, so a specific cookie can be read without parsing headers:
r.cookies['ts'] # 'example_cookie_12345'
# To send cookies with a request, pass a dict as the cookies parameter:
cs = {'token': '12345', 'status': 'working'}
r = requests.get(url, cookies=cs)
# Finally, to set a timeout, pass timeout in seconds:
r = requests.get(url, timeout=2.5)  # give up after 2.5 seconds
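A small sketch combining the pieces above with a requests.Session (which reuses the underlying connection across requests) and basic error handling; the URL, headers and parameters are placeholders:

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'my-client/1.0'})  # applied to every request

try:
    r = session.get('https://example.com/api/items',
                    params={'page': 1},
                    timeout=5)
    r.raise_for_status()          # raise requests.HTTPError for 4xx/5xx responses
    items = r.json()
except requests.exceptions.RequestException as e:
    logger.exception(e)
    raise
finally:
    session.close()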
elasticsearch
pip3 install elasticsearch
API docs: https://elasticsearch-py.readthedocs.io/en/v8.0.0/api.html#module-elasticsearch
from elasticsearch import helpers, Elasticsearch
# Connect to ES
es_db = Elasticsearch(hosts="http://xxxx:9200", http_auth=(username, password), timeout=10)
# Bulk insert
# parallel_bulk is lazy: nothing is sent until you iterate over its results
# chunk_size: number of documents sent per request, default 500
for success, info in helpers.parallel_bulk(es_db, es_doc_list, chunk_size=5000):
    if not success:
        logger.error("ES document insert failed: %s", info)
# Structure of the documents to insert
es_doc_list = [{
    "_index": "",
    "_id": "",
    "_score": 100,
    "_source": {"name": "", "age": 10}
}]
# If there is no _source key, the metadata fields are popped off automatically and the
# remaining keys become the document body, so this form also works:
es_doc_list = [{
    "_index": "",
    "_id": "",
    "_score": 100,
    "name": "",
    "age": 10
}]
# The official docs say JSON strings are also accepted
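Since parallel_bulk accepts any iterable of actions, the documents can also be streamed from a generator instead of building the whole list in memory; a minimal sketch (the index name and source rows are placeholders, es_db is the client from above):

def generate_actions(rows):
    # Yield one action dict per source row instead of materializing es_doc_list
    for row in rows:
        yield {
            "_index": "my_index",      # placeholder index name
            "_id": row["id"],
            "_source": {"name": row["name"], "age": row["age"]},
        }

rows = [{"id": 1, "name": "a", "age": 10}, {"id": 2, "name": "b", "age": 20}]
for success, info in helpers.parallel_bulk(es_db, generate_actions(rows), chunk_size=5000):
    if not success:
        logger.error("ES document insert failed: %s", info)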
ES paginated queries come in three flavors:
- From + Size
- Search After
- Scroll
From + Size
From + Size can return at most max_result_window hits; Elasticsearch's default max_result_window is 10,000. With 10 hits per page, that means you can page no further than page 1,000.
query_body = {"query": {"match_all": {}}}
query = es.search(index="",
                  body=query_body,
                  from_=0,
                  size=100)
Search After
A single request's size still cannot exceed max_result_window, but the total number of results paged through can.
The Search After API changed in 7.10: before 7.10 there is no PIT (point-in-time) snapshot. The calls below use the post-7.10 API.
Pre-7.10 usage: https://www.elastic.co/guide/en/elasticsearch/reference/7.7/search-request-body.html#request-body-search-search-after
# 1. Open a PIT (point-in-time) view
query = es.open_point_in_time(index='msdap_multisource', keep_alive='1m')
pid = query["id"]
# 2. Initial query
query = es.search(size=10, query={"match_all": {}}, pit={"id": pid, "keep_alive": "1m"}, sort=[{"id": "asc"}])
result = query['hits']['hits']
# 3. Page forward using the "sort" value of the last document, e.g. "sort": ["200", 4]
result_size = len(result)
while result_size > 0:
    search_after = result[-1]["sort"]
    query = es.search(size=10, query={"match_all": {}}, pit={"id": pid, "keep_alive": "1m"},
                      sort=[{"id": "asc"}], search_after=search_after)
    page = query['hits']['hits']
    result_size = len(page)
    if result_size > 0:
        result.extend(page)
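When paging is finished, the PIT can be released instead of waiting for keep_alive to expire; in the 8.x client this should be close_point_in_time (an assumption about the client version in use):

# Release the point-in-time snapshot once paging is done
es.close_point_in_time(id=pid)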
Scroll
The size of a single scroll pass likewise cannot exceed max_result_window.
The official docs stress that the scroll API is no longer recommended for deep pagination; to page through more than the top 10,000 results, PIT + search_after is the recommended approach.
query = es.search(index=self.database,
                  body=query_body,
                  request_timeout=100, scroll='1m')
sid = query['_scroll_id']
scroll_size = len(query['hits']['hits'])
result = query['hits']['hits']
while scroll_size > 0:
    data = es.scroll(scroll_id=sid, scroll='1m')
    scroll_size = len(data['hits']['hits'])
    if scroll_size > 0:
        result.extend(data['hits']['hits'])
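Once the loop finishes, the server-side scroll context can be released explicitly rather than waiting for the keep-alive to expire; a one-line sketch assuming the same client object:

# Free the server-side scroll context when finished
es.clear_scroll(scroll_id=sid)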