2021.07.22
-
pyspark 获取sql数据
import os, re, json, time, datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import Counter
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Run on YARN against the grocery ETL test queue; driver memory is raised
# because query results are pulled back to the driver as a pandas DataFrame.
conf = SparkConf().setMaster('yarn') \
    .set('spark.yarn.queue', 'root.zw01.hadoop-grocery.etltest') \
    .set('spark.executor.memory', '15G') \
    .set("spark.driver.memory", "60g")  # enlarge JVM heap on the driver

spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()


def read_hive_by_sql(sql, tag):
    '''Run *sql* against Hive and return the result as a pandas DataFrame.

    :param sql: the HiveQL query to execute.
    :param tag: a short label used only in the progress print-out.
    :return: pandas.DataFrame with the query's rows and column names.
    '''
    # toPandas() is the supported conversion path; it collects to the driver
    # in one step instead of rebuilding the frame from Row objects by hand.
    tmpDf = spark.sql(sql).toPandas()
    print('{0}->样本数:{1}'.format(tag, tmpDf.shape[0]))
    return tmpDf


sql = '''
select normalized_name, category_id, picture_urls
from mart_grocery_udc.dim_standard_sku_all
where normalized_name like '%金装卷纸%'
'''
read_hive_by_sql(sql, 'search')
-
复制文件夹
import os, shutil

# Copy the local model directory onto the shared cephfs volume.
# NOTE(review): the original snippet called os.getcwd() and discarded the
# result — dead code, removed. copytree raises FileExistsError if the
# destination directory already exists.
shutil.copytree('./v1_torch/', '/home/hadoop-datamining/cephfs/data/zhanghaozhou/v1_torch')
2021.7.23
-
pandas取两个表里的不同行:
def anti_join(x, y, on=None):
    """Return the rows of *x* that have no matching row in *y* (left anti-join).

    :param x: left DataFrame.
    :param y: right DataFrame.
    :param on: column label(s) to join on. Defaults to None, in which case
        pandas joins on the intersection of the two frames' columns — so the
        parameter can be omitted when no special key is needed (this matches
        what the original docstring promised but the signature did not allow).
    :return: the subset of *x* not present in *y*.
    """
    merged = pd.merge(left=x, right=y, how='left', indicator=True, on=on)
    # '_merge' == 'left_only' marks rows that exist only in x; drop the
    # bookkeeping column before returning.
    return merged.loc[merged._merge == 'left_only', :].drop(columns='_merge')
-
取相同行:
# Keep only the (userId, movieId) pairs present in both frames — the
# DataFrame method form of pd.merge(df1, df2, how='inner', on=[...]).
s1 = df1.merge(df2, how='inner', on=['userId', 'movieId'])
-
将Series转换成DF
import numpy as np
import pandas as pd

# Convert a Series into a DataFrame: the index becomes one column
# ('Product_Name') and the values another ('Price').
product_labels = ['A37','A50','R7S','Note5', 'G7','R9_Plus','5C','X5_Pro','MX3','M5']
data = pd.Series(np.random.randn(10)*500+1000, index=product_labels)
df = pd.DataFrame({'Product_Name':data.index, 'Price':data.values})
print(df)