Below are some notes from my personal experience with pandas. I'd be delighted if they help you.
Database connections
Connecting to MySQL and SQL Server
import pandas as pd
import pymssql
from sqlalchemy import create_engine
# MySQL
engine = create_engine("mysql+pymysql://user:password@host/dbname?charset=encoding")
# SQL Server
engine = create_engine('mssql+pymssql://user:password@host/dbname', echo=True)
# or connect directly with pymssql
section_conn = pymssql.connect(server="127.0.0.1", port=1433, user="user", password="password", database="dbname")
connection = engine.connect()
sql = """
select *
from
    t_name
"""
df = pd.read_sql(sql, connection)
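Writing a DataFrame back through the same engine is the symmetric operation; a minimal sketch with to_sql (the table name 'result_table' is hypothetical):

```
# create-or-append the table; index=False skips writing the DataFrame index
df.to_sql('result_table', con=engine, if_exists='append', index=False)
```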
Connecting to Elasticsearch
pipenv install elasticsearch
pipenv install pandasticsearch
# if you don't use pipenv, plain pip works too
pip install elasticsearch
pip install pandasticsearch
import pandas as pd
from elasticsearch import Elasticsearch, exceptions
from pandasticsearch import Select
pd.set_option('display.max_columns', None)  # make pandas show every column
url = {"host": 'host', "port": 9200, "timeout": 1500}  # 9200 is Elasticsearch's default port
es = Elasticsearch([url], maxsize=25, http_compress=True)  # maxsize = max connections; http_compress compresses requests, which seems a bit faster for large payloads
result_dict = es.search(
    index='index_name',
    body={"query": {"bool": {"must": [{"match_all": {}}], "must_not": [], "should": []}},  # body is an ordinary ES query DSL
          "size": 10000,
          "sort": [], "aggs": {}})
pandas_df = Select.from_dict(result_dict).to_pandas()
print(pandas_df.columns)
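A single search like the one above is capped at 10,000 hits by default. For larger result sets, a sketch using the client's scan helper, which pages through the scroll API (the index name is a placeholder):

```
from elasticsearch import helpers

# scan() lazily yields every matching document
hits = helpers.scan(es, index='index_name', query={"query": {"match_all": {}}})
big_df = pd.DataFrame(hit['_source'] for hit in hits)
```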
Recovering deleted input (Jupyter)
# Strictly a notebook feature rather than pandas, but accidental deletions happen.
# These two commands print all previous inputs (works only while the current Jupyter kernel is still running).
for line in locals()['In']:
    print(line)
# or, more directly, the magic command
%history
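%history can also dump everything straight to a file; a sketch (the filename is arbitrary):

```
# -g searches past sessions as well, -f writes the output to a file
%history -g -f recovered_input.py
```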
Groupby aggregation in pandas
import requests
import pandas as pd
# a public finance dataset (Taiwan stock prices)
url = 'http://finmindapi.servebeer.com/api/data'
form_data = {'dataset': 'TaiwanStockPrice', 'stock_id': '2317'}
res = requests.post(url, verify=True, data=form_data)
temp = res.json()
data = pd.DataFrame(temp['data'])
data
|      | date       | stock_id | Trading_Volume | Trading_money | open | max  | min  | close | spread | Trading_turnover |
| ---- | ---------- | -------- | -------------- | ------------- | ---- | ---- | ---- | ----- | ------ | ---------------- |
| 0    | 1992-01-04 | 2317 | 785000   | 47783000   | 62.0 | 62.0 | 60.0 | 61.0 | 0.0  | 237   |
| 1    | 1992-01-06 | 2317 | 570000   | 34496000   | 61.0 | 61.0 | 60.0 | 61.0 | 0.0  | 226   |
| 2    | 1992-01-07 | 2317 | 1484596  | 93646664   | 61.0 | 65.0 | 60.0 | 65.0 | 4.0  | 586   |
| 3    | 1992-01-08 | 2317 | 710798   | 45236173   | 65.0 | 65.0 | 62.5 | 64.5 | -0.5 | 363   |
| 4    | 1992-01-09 | 2317 | 685948   | 43533224   | 63.0 | 64.0 | 62.5 | 64.0 | -0.5 | 313   |
| ...  | ...        | ...  | ...      | ...        | ...  | ...  | ...  | ...  | ...  | ...   |
| 7208 | 2019-12-16 | 2317 | 34055107 | 3121729569 | 91.5 | 92.4 | 91.1 | 91.6 | 0.6  | 14433 |
| 7209 | 2019-12-17 | 2317 | 31166098 | 2846439695 | 91.8 | 91.9 | 91.1 | 91.6 | 0.0  | 12212 |
| 7210 | 2019-12-18 | 2317 | 58840462 | 5427958104 | 91.6 | 92.8 | 91.1 | 92.4 | 0.8  | 24632 |
| 7211 | 2019-12-19 | 2317 | 28904382 | 2653486845 | 92.0 | 92.2 | 91.3 | 92.2 | -0.2 | 11949 |
| 7212 | 2019-12-20 | 2317 | 38237000 | 3498154100 | 92.5 | 92.6 | 91.1 | 91.1 | -1.1 | 9267  |

7213 rows × 10 columns
# group by a column, count each group's size, and turn the result into a new DataFrame
data.groupby(['Trading_turnover']).size().reset_index(name='counts')
|      | Trading_turnover | counts |
| ---- | ---------------- | ------ |
| 0    | 0     | 1   |
| 1    | 10    | 1   |
| 2    | 100   | 3   |
| 3    | 10006 | 1   |
| 4    | 10011 | 1   |
| ...  | ...   | ... |
| 5532 | 999   | 1   |
| 5533 | 9991  | 1   |
| 5534 | 9994  | 1   |
| 5535 | 9997  | 1   |
| 5536 | 9999  | 2   |

5537 rows × 2 columns
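Beyond size(), the same groupby can compute several statistics in one pass via named aggregation; a sketch on the data above (the column pairings are just for illustration):

```
# mean closing price and total volume for each spread value
stats = data.groupby('spread').agg(
    avg_close=('close', 'mean'),
    total_volume=('Trading_Volume', 'sum'),
).reset_index()
```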
Deduplication in pandas
# drop rows whose value in the given column(s) duplicates an earlier row
# drop_duplicates(['col1', 'col2']) accepts multiple columns
df = data.drop_duplicates(['spread'])
df
|      | date       | stock_id | Trading_Volume | Trading_money | open | max  | min  | close | spread | Trading_turnover |
| ---- | ---------- | -------- | -------------- | ------------- | ---- | ---- | ---- | ----- | ------ | ---------------- |
| 0    | 1992-01-04 | 2317 | 785000    | 47783000    | 62.0 | 62.0 | 60.0 | 61.0 | 0.0  | 237   |
| 2    | 1992-01-07 | 2317 | 1484596   | 93646664    | 61.0 | 65.0 | 60.0 | 65.0 | 4.0  | 586   |
| 3    | 1992-01-08 | 2317 | 710798    | 45236173    | 65.0 | 65.0 | 62.5 | 64.5 | -0.5 | 363   |
| 5    | 1992-01-10 | 2317 | 1098914   | 70909168    | 64.0 | 66.0 | 63.0 | 65.0 | 1.0  | 453   |
| 8    | 1992-01-14 | 2317 | 550302    | 37054979    | 66.5 | 68.0 | 66.0 | 67.5 | 1.5  | 261   |
| ...  | ...        | ...  | ...       | ...         | ...  | ...  | ...  | ...  | ...  | ...   |
| 5623 | 2013-06-26 | 2317 | 57275136  | 4138658123  | 72.3 | 73.2 | 71.2 | 73.0 | 2.8  | 24726 |
| 6156 | 2015-08-24 | 2317 | 88531178  | 7213060618  | 83.0 | 84.9 | 77.9 | 81.6 | -3.9 | 37966 |
| 6157 | 2015-08-25 | 2317 | 60830475  | 5087320875  | 81.0 | 85.0 | 81.0 | 85.0 | 3.4  | 24890 |
| 6161 | 2015-08-31 | 2317 | 62543286  | 5657867812  | 89.5 | 92.5 | 88.8 | 92.5 | 3.9  | 19711 |
| 7032 | 2019-04-01 | 2317 | 171442589 | 13641328837 | 77.0 | 80.8 | 76.7 | 80.8 | 7.3  | 66838 |

159 rows × 10 columns
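drop_duplicates also lets you pick which duplicate survives; a small sketch (by default the first occurrence is kept):

```
# keep='last' retains the most recent occurrence of each spread value;
# keep=False would drop every row that has a duplicate
df_last = data.drop_duplicates(subset=['spread'], keep='last')
```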
Switching engines when exporting to Excel
# Plain to_excel usually works, but very large exports can fail, and links in the sheet may get mangled
# switching to the xlsxwriter engine avoids those export problems and writes URLs as plain strings
# options={'strings_to_urls': False} tells xlsxwriter not to convert URLs into hyperlinks; always set it when exporting lots of data
writer = pd.ExcelWriter('./excel_name.xlsx', engine='xlsxwriter', options={'strings_to_urls': False})
# first sheet
data.to_excel(writer, 'sheet_name_1', index=False)
# second sheet
data.to_excel(writer, 'sheet_name_2', index=False)
writer.save()
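On newer pandas (roughly 1.3 onward) the options keyword moved into engine_kwargs, and writer.save() was later replaced by close(); a sketch under that assumption, using a context manager so the file is closed automatically:

```
with pd.ExcelWriter('./excel_name.xlsx',
                    engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    data.to_excel(writer, sheet_name='sheet_name_1', index=False)
```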
The apply function
# use apply when you need to run a function over every row or every column
def splice_url(row):
    # row is a Series holding one row; transform its values here
    if row['name']:
        return row['name']
# axis=1 applies the function to each row, axis=0 to each column
# (section_info here is any DataFrame with a 'name' column)
section_info['name'] = section_info.apply(splice_url, axis=1)
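For a quick row-wise calculation, the same pattern works inline with a lambda; a sketch against the stock data above (the derived column name is arbitrary):

```
# per-row relative spread, computed from columns shown in the tables above
data['spread_pct'] = data.apply(lambda row: row['spread'] / row['open'], axis=1)
```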