import pandas as pd import numpy as np import missingno as mso from pandas import Series,DataFrame import pymongo import pymysql import time #连接数据库 def select_mysql(): db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='test', charset='utf8') cur = db.cursor() sql = "select cid from spider_category_task " # sql = "select cid from spider_category_task where cid = 110205"#测试 cur.execute(sql) cid_list = cur.fetchall() db.close() for cid in cid_list: cid =cid[0] find_pymongo_data(cid) def find_pymongo_data(category): a = 25 ''' 传入一个类目名称,从mongo里面查出该类目的所有数据,并把 它转换为pandas里面的DataFrame对象 :param key_word: 类目名称 :return: ''' mongo_user = "root" mongo_pwd = "123456*" client = pymongo.MongoClient(host = 'localhost',port = 27017 ) #数据库登录需要账号密码的话 client.test.authenticate(mongo_user,mongo_pwd) # test代表数据库名称 db = client['test']#获得数据库的句柄 coll = db['20181022']#获得collection的句柄 coll.create_index('category')#添加索引 #加载数据 # data = DataFrame(list(coll.find({'key_word':key_word})))#根据搜索的关键词拿出数据 mongo_data = DataFrame(list(coll.find({'category':category}))) client.close() try: if len(mongo_data) > 0:#查询结果不为空执行 mongo_data = mongo_data[['price','sold']]#取里面的价格和销量 data = mongo_data[~mongo_data['sold'].isin([''])]#取出里面的价格不为空的数据 print(data) data = data.astype({'price': 'float', 'sold': 'int'})#价格转为浮点型.销量转为整数型 splite_price(data,category,a) except Exception as e: pass def splite_price(data,category, a): ''' :param data: 从mongo里面获取DataFrame对象的数据 :return: ''' try: data['group'] = pd.qcut(data.price,a)#价格切割为25段,如果切不了,依次减一,递归继续切 # 求每个价格分段对应的sales的和 datas = data[['sold', 'group']].groupby('group').sum().reset_index() except Exception as e: if a > 0: a -= 1 splite_price(data,category, a) else: print('一段也切不了') # print((data['group']))#二位数组 price_group =datas['group'] sold_group = datas['sold'] price_group_list = [] for i in price_group: i = (str(i)).replace('(','') i = i.replace(']','') price_list = i.split(',') price_group_list.append(price_list) # low_price = price_list[0] # high_price =price_list[1] data_list = zip(price_group_list,sold_group)#打包,两个列表合为一个列表(数据一一对应) data_list = (list(data_list)) print(data_list) # insert_mysql(data_list,category) # def insert_mysql(data_list,category): ''' 插入到数据库里面 :param data_list: :param category: :return: ''' # print(data_list) pass datetime = time.strftime('%Y%m%d') # data_list = splite_price() db = pymysql.connect(host = 'localhost',port = 3306,user = 'test',password = 'root*',db = 'test',charset = 'utf8') cur = db.cursor() for data in data_list: min_price = data[0][0] max_price = data[0][1] min_price = float(min_price) max_price = float(max_price) sold = data[1] # print(sold) sql = "insert into price_break_test (min_price,max_price,cate_id,sold,date_time) values(%s,%s,'%s','%s','%s')"%(min_price,max_price,category,sold,datetime) cur.execute(sql) db.commit() db.close() select_mysql()
pandas切割mongodb数据
最新推荐文章于 2022-05-21 15:58:55 发布