pandas 常用

以下是pandas 个人使用过程的一些总结, 如果能给你提供帮助不胜荣幸

数据库连接

连接 mysql, sqlserver

from sqlalchemy import create_engine
import pymssql

# MySQL connection via SQLAlchemy + PyMySQL.
engine = create_engine("mysql+pymysql://用户名:密码@ip/库名?charset=编码")
# SQL Server via SQLAlchemy + pymssql; echo=True logs the emitted SQL.
engine = create_engine('mssql+pymssql://用户名:密码@ip/库名', echo=True)
# ... or a raw DB-API connection straight through pymssql.
section_conn = pymssql.connect(server="127.0.0.1", port=1433, user="用户名", password="密码",database="库名")
sql = """
     select * 
from 
    t_name
"""
# Use a context manager so the connection is always returned to the
# pool, even if read_sql raises (the original never closed it).
with engine.connect() as connection:
    df = pd.read_sql(sql, connection)

连接 elasticsearch

pipenv install elasticsearch
pipenv install pandasticsearch
# 没有使用过pipenv 可以使用 pip
pip install elasticsearch
pip install pandasticsearch

from elasticsearch import Elasticsearch, exceptions
from pandasticsearch import Select

# Show every column when the frame is printed.
pd.set_option('display.max_columns', None)

# Host settings for the cluster; the port must be an int.
es_host = {"host": 'ip', "port": int('端口'), "timeout": 1500}
# maxsize = connection-pool size; http_compress gzips request bodies,
# which tends to speed up large transfers.
client = Elasticsearch([es_host], maxsize=25, http_compress=True)

# A plain bool/match_all query; "size" caps the number of hits returned.
query_body = {
    "query": {"bool": {"must": [{"match_all": {}}], "must_not": [], "should": []}},
    "size": 10000,
    "sort": [],
    "aggs": {},
}
raw_hits = client.search(index='索引名称', body=query_body)

# pandasticsearch flattens the raw ES response dict into a DataFrame.
frame = Select.from_dict(raw_hits).to_pandas()
print(frame.columns)

pandas 恢复功能

# Notebook-recovery trick: while the Jupyter kernel is still running,
# replay everything typed so far.  IPython stores every input cell in
# the special `In` list, so accidentally deleted code can be recovered
# from it (the kernel must not have been restarted).
for line in locals()['In']:
    print(line)
# Or, more directly, the IPython `history` magic.
# NOTE(review): `history` is an IPython/Jupyter magic, not valid plain
# Python — this whole snippet only works inside a live IPython session.
history

pandas分组聚合

import requests
import pandas as pd 

# FinMind open financial dataset: daily prices for one Taiwan stock.
url = 'http://finmindapi.servebeer.com/api/data'
form_data = {'dataset':'TaiwanStockPrice','stock_id':'2317'}
# timeout keeps the script from hanging forever on a dead endpoint;
# raise_for_status turns HTTP errors into an exception here instead of
# letting .json() fail later with a confusing message.
res = requests.post(url, verify=True, data=form_data, timeout=30)
res.raise_for_status()

temp = res.json()
data = pd.DataFrame(temp['data'])
data  # notebook-style display of the frame
date  stock_id  Trading_Volume  Trading_money  open  max  min  close  spread  Trading_turnover
01992-01-0423177850004778300062.062.060.061.00.0237
11992-01-0623175700003449600061.061.060.061.00.0226
21992-01-07231714845969364666461.065.060.065.04.0586
31992-01-0823177107984523617365.065.062.564.5-0.5363
41992-01-0923176859484353322463.064.062.564.0-0.5313
.................................
72082019-12-16231734055107312172956991.592.491.191.60.614433
72092019-12-17231731166098284643969591.891.991.191.60.012212
72102019-12-18231758840462542795810491.692.891.192.40.824632
72112019-12-19231728904382265348684592.092.291.392.2-0.211949
72122019-12-20231738237000349815410092.592.691.191.1-1.19267

7213 rows × 10 columns

# Group-by aggregation that yields a fresh DataFrame:
# one row per distinct Trading_turnover value plus its frequency.
(
    data.groupby(['Trading_turnover'])
        .size()
        .reset_index(name='counts')
)
Trading_turnover  counts
001
1101
21003
3100061
4100111
.........
55329991
553399911
553499941
553599971
553699992

5537 rows × 2 columns

```

pandas 去重

# pandas 去重
# drop_duplicates(['name1', 'name2']) 可以指定多列
df = data.drop_duplicates(['spread'])
df
date  stock_id  Trading_Volume  Trading_money  open  max  min  close  spread  Trading_turnover
01992-01-0423177850004778300062.062.060.061.00.0237
21992-01-07231714845969364666461.065.060.065.04.0586
31992-01-0823177107984523617365.065.062.564.5-0.5363
51992-01-10231710989147090916864.066.063.065.01.0453
81992-01-1423175503023705497966.568.066.067.51.5261
.................................
56232013-06-26231757275136413865812372.373.271.273.02.824726
61562015-08-24231788531178721306061883.084.977.981.6-3.937966
61572015-08-25231760830475508732087581.085.081.085.03.424890
61612015-08-31231762543286565786781289.592.588.892.53.919711
70322019-04-0123171714425891364132883777.080.876.780.87.366838

159 rows × 10 columns

pandas 导出 excel 时更换引擎

# 一般来说导出 excel 直接使用 to_excel 但是导出大量数据时会有问题,并且 excel 中的链接也会消失
# pandas 更换 excel 引擎 (导出数据没有上限, 并且 url 转 str 类型)
#  options={'strings_to_urls': False} 参数表格表示  url 不转超链接 导出大量数据时一定要加此参数
writer = pd.ExcelWriter('./excel_name.xlsx',engine='xlsxwriter', options={'strings_to_urls': False})
# sheet1
data.to_excel(writer,'sheet 名称',index=False)
# sheet2
data.to_excel(writer,'sheet 名称2',index=False)
writer.save()

apply 函数

# Per-row (or per-column) transform intended for DataFrame.apply.
def splice_url(data):
    """Return the row's 'name' value when it is truthy, else None."""
    value = data['name']
    if not value:
        return None
    # Place any per-value processing here before returning.
    return value
# axis=1 applies the function once per row; axis=0 would apply it once
# per column.  Each row is fed to splice_url and the result written
# back into the 'name' column.
section_info['name'] = section_info.apply(splice_url, axis=1)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值