Python2
Encoding issues
String encoding alignment: Unicode strings can be concatenated with each other, and byte (non-Unicode) strings can be concatenated with each other; mixing the two requires an explicit encode/decode first.
Encode a Unicode string as UTF-8 so Chinese displays correctly: encode_str = unicode_str.encode('utf-8')
unicode_str = u'\u4e2d\u56fd'
print type(unicode_str)
print unicode_str
encode_str = unicode_str.encode('utf-8')
print type(encode_str)
print encode_str
Note: in Python 2, print can display a Unicode string directly as Chinese, so the explicit encode is only needed when the string is saved (for example, written to a file).
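A minimal Python 2 sketch of that last point (out.txt is a hypothetical path, not from the original notes):
# encode before writing; writing the raw unicode object would trigger an implicit ASCII encode and fail
with open('out.txt', 'wb') as f:
    f.write(unicode_str.encode('utf-8'))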
Python3
Notes
Strings
string[s_idx:e_idx]: when s_idx is negative and e_idx is positive, the slice is often empty, because the negative start may resolve to an index at or beyond e_idx.
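Two illustrative cases (my own examples):
s = 'abcdef'
print(s[-2:3])   # '' : -2 resolves to index 4, which is past 3, so the slice is empty
print(s[-5:3])   # 'bc': -5 resolves to index 1, which is before 3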
Breakpoints
import pdb; pdb.set_trace()
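Since Python 3.7 the built-in breakpoint() does the same thing and is usually more convenient:
breakpoint()  # by default equivalent to import pdb; pdb.set_trace()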
Random
import random

random.seed(0)

def random_split_data(data_list, ratio):
    val_nums = len(data_list)
    offset = int(val_nums * ratio)
    if val_nums == 0 or offset < 1:
        return [], data_list
    random.shuffle(data_list)
    sublist_1 = data_list[offset:]
    sublist_2 = data_list[:offset]
    return sublist_1, sublist_2

# url_set is assumed to be defined elsewhere
url_train, url_valid = random_split_data(list(url_set), ratio=0.1)

a = list(range(10))
b = random.sample(a, k=3)    # 3 distinct elements (without replacement)
c = random.choices(a, k=3)   # 3 elements with replacement
Time
import time
import datetime

print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

def print_time(text):
    """
    Print the current time followed by the given text.
    """
    now = datetime.datetime.now()
    formatted_now = now.strftime("%Y/%m/%d %H:%M:%S")
    content_to_print = "[%s]-%s" % (formatted_now, text)
    print(content_to_print)
Breaking out of nested loops
for...else syntax: the else block runs only when the loop finishes without hitting a break, so a break must appear inside the for loop; otherwise the loop runs to completion, the else always executes, and you get unexpected results. A nested-loop sketch follows the example below.
s = [11, 2, 3]
for i in s:
    if i < 5:
        print(1)
else:
    print("s")   # there is no break in the loop, so this else branch always runs
Saving JSON to a file / keeping Chinese readable in JSON
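This heading has no snippet attached in the notes; a minimal sketch of what it refers to, with hypothetical data and filename, is:
import json

data = {'城市': '北京'}
with open('demo.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps Chinese characters readable instead of \uXXXX escapes
    json.dump(data, f, ensure_ascii=False, indent=2)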
Conda
Exception handling
while True:
    try:
        num1 = int(input('请输入一个数字:'))
        num2 = int(input('请输入一个数字:'))
        division = num1 / num2
    except (ZeroDivisionError, ValueError) as e:
        if isinstance(e, ZeroDivisionError):
            print('程序出现了除以零错误')
        elif isinstance(e, ValueError):
            print('程序输入类型错误')
    else:
        print('两个数字相除等于{}'.format(division))
        break
Opening and writing files
import sys
import json
import math
import re
from tqdm import tqdm

j_ls = []
prefix_path = '/mnt/data_hub/raw_data/'
open_file = prefix_path + sys.argv[1]
save_file = prefix_path + 'processed_data/' + sys.argv[2]
for line in tqdm(open(open_file, 'r')):
    j_ls.append(json.loads(line))
def split_text(text):
    # Strip trailing sentence-ending punctuation, then split the text into sentence/punctuation pairs.
    text = text.strip('。.!!??')
    text = text.replace('...', '')
    text = text.replace(' ', '')
    pattern = r'(。|!|\!|\.|?|\?)'
    text_splits = re.split(pattern, text)
    text_splits = [t for t in text_splits if t]
    text_splits_droplast = text_splits[:-1]
    sentences = text_splits_droplast[::2]
    punctuations = text_splits_droplast[1::2]
    if not text_splits or not sentences or not punctuations:
        return []
    ls = []
    for i, j in zip(sentences, punctuations):
        ls.append(i + j)
    if text_splits[-1] != '':
        ls.append(text_splits[-1] + '。')
    return ls
def process_text(j):
    ls = []
    text_ls = split_text(j['text'][0])
    text_cnt = len(text_ls)
    prompt = '根据上下文情景,尽可能真实地、详细地补全下面一段文本的后续部分。\n文本:'
    p = 0.3
    # Keep roughly the first (1 - p) of the sentences as the input, the rest as the target output.
    split_num = text_cnt - math.ceil(text_cnt * p)
    input_text = prompt + text_ls[0]
    for text in text_ls[1:split_num]:
        input_text += text
    output_text = text_ls[split_num]
    if split_num + 1 < text_cnt:
        for text in text_ls[split_num + 1:]:
            output_text += text
    ls.append(
        {
            'instruction': input_text,
            'response': output_text,
            'data_source': j['data_source'],
            'model_arch': j['model_arch'],
            'id': j['id']
        }
    )
    return ls
i = 0
ls = []
with open(save_file, 'w') as f:
    for json_data in tqdm(j_ls, desc="formatting.."):
        text_cnt = len(json_data['text'])
        ls.append(text_cnt)
        if text_cnt == 1:
            i += 1
        text = json_data['text'][0]
        if not text.strip('。.!!??') or split_text(text) == []:
            continue
        for sample in process_text(json_data):
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
print('num of text_cnt == 1 is', i)
print('min/max len of ls', min(ls), max(ls))
Environment and packages
Switch pip to a mirror: pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
List the available versions of a package: pip index versions xxx_package
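To use the mirror for a single install without changing the global config (xxx_package is just a placeholder):
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple xxx_package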
chatgpt
import sys
import time
import json
import datetime
import requests
from multiprocessing import Pool
from tqdm import tqdm

GPT_API = "https://aishopping.baidu-int.com/api/chatCompletion"
GPT_API_HEADERS = {'Content-Type': 'application/json'}
BS_NAME = 'dhuman'
GPT_API_TOKEN = '560d56ee-5e12-4f4a-8510-bf96f0bbe2c5'
SERVICE_ID = 15

import openai

openai.api_type = "azure"
openai.api_version = "2023-05-15"
openai.api_base = "https://azure-gpt-05.openai.azure.com/"
openai.api_key = "10caa809b8a4472ea0ae42bf89c8be8b"
def run_gpt(messages, temperature, answer_max_token, use_default=False):
    """
    Call GPT through the internal proxy API.
    """
    if use_default:
        query_dict = {
            "messages": messages,
            "max_tokens": answer_max_token
        }
    else:
        query_dict = {
            "messages": messages,
            "max_tokens": answer_max_token,
            "temperature": temperature
        }
    payload = json.dumps({
        "token": GPT_API_TOKEN,
        "serviceId": SERVICE_ID,
        "bussinessName": BS_NAME,
        "azureParam": query_dict
    })
    response = requests.request("POST", GPT_API, headers=GPT_API_HEADERS, data=payload)
    json_data = response.json()
    answer = json_data["data"]["answer"]
    return answer
def run_gpt_old(messages, temperature, answer_max_token, use_default=False):
    """
    Call GPT via the openai Azure SDK.
    """
    if use_default:
        completion = openai.ChatCompletion.create(
            engine="azure-gpt-08",
            max_tokens=answer_max_token,
            messages=messages
        )
    else:
        completion = openai.ChatCompletion.create(
            engine="azure-gpt-08",
            temperature=temperature,
            max_tokens=answer_max_token,
            messages=messages
        )
    answer = completion.choices[0]["message"]["content"]
    return answer
def request_gpt_once(messages, temperature=0.01, answer_max_token=1024, sleep_time=8, max_retry=15, use_default=False):
    """
    Request GPT, retrying on failure.
    """
    retry = 0
    while True:
        if retry > max_retry:
            return "[REQUEST GPT ERROR]"
        try:
            answer = run_gpt_old(messages, temperature, answer_max_token, use_default=use_default)
            return answer
        except Exception as e:
            now = datetime.datetime.now()
            formatted_now = now.strftime("%Y/%m/%d %H:%M:%S")
            print("[%s]-[Exception][%s]" % (formatted_now, e))
            print("[%s]-[RETRY][ChatCompletion.create, sleeping %ss ...]" % (formatted_now, sleep_time))
            time.sleep(sleep_time)
            retry += 1
def gpt4_turbo_once_testing():
    prompt = "你好,你是谁?"
    system = ""
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    res = request_gpt_once(messages, answer_max_token=4096, use_default=True)
    print("\n" * 1)
    print("========= GPT API Testing =========")
    print(f"==> messages: {messages}")
    print(f"==> GPT Response: {res}")
    print("===================================")
    print("\n" * 1)
def gpt4_turbo_once_single_v2():
    prompt = "你是谁?"
    system = ""
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    res = request_gpt_once(messages, answer_max_token=4096, use_default=True)
    print("\n" * 1)
    print("========= GPT API Testing =========")
    print(f"==> messages: {messages}")
    print(f"==> GPT Response: {res}")
    print("===================================")
    print("\n" * 1)
def gpt4_turbo_once_parallel(dic):
    prompt = dic["prompt"]
    system = "system"
    # gpt4_turbo_once is assumed to be a prompt/system wrapper defined elsewhere in the original codebase.
    res = gpt4_turbo_once(prompt, system, temperature=0.01, answer_max_token=2048)
    dic["gpt_res"] = res
    return dic
def call_gpt4_once_single():
    temperature = 0.01
    temperature_print = str(temperature).replace(".", "")
    answer_max_token = 4096
    data_list = []
    input_file = "data/zl_0511_30.jsonl"
    with open(input_file, "r") as f:
        for line in f:
            line = line.strip("\n")
            dic = json.loads(line)
            data_list.append(dic)
    out_file = "res/gpt/" + "zl_0511_30.gpt_tempNo.jsonl"
    out_file_f = open(out_file, "w")
    s_time = time.perf_counter()
    for dic in tqdm(data_list[:]):
        prompt = dic["prompt"]
        system = dic.get("system", "")
        prev_res = dic.get("gpt_res", "")
        if prev_res and "ERROR" not in prev_res:
            out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")
            continue
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        if not prompt.strip():
            res = "[PROMPT IS EMPTY]"
        else:
            res = request_gpt_once(messages, answer_max_token=answer_max_token, use_default=True)
        dic["gpt_res"] = res
        out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")
    out_file_f.close()
    e_time = time.perf_counter()
    exec_time = round(e_time - s_time, 3)
    print(f"程序执行时间: {exec_time}s")

call_gpt4_once_single()
def call_gpt4_2_rounds():
    temperature = 0.8
    answer_max_token = 4096
    temperature_2nd = 0.5
    answer_max_token_2nd = 4096
    data_list = []
    input_file = "data/train/gen_script_1124.prompt.jsonl"
    with open(input_file, "r") as f:
        for line in f:
            line = line.strip("\n")
            dic = json.loads(line)
            data_list.append(dic)
    temperature_pt = str(temperature).replace(".", "")
    temperature_2nd_pt = str(temperature_2nd).replace(".", "")
    out_file = "res/gpt/train/" + input_file.split("/")[-1].replace(".jsonl", "") + ".reflection.gpt_temp%s_temp2nd%s.jsonl" % (temperature_pt, temperature_2nd_pt)
    out_file_f = open(out_file, "w")
    s_time = time.perf_counter()
    for dic in tqdm(data_list[:]):
        prompt = dic["prompt"]
        system = dic["system"]
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        if not prompt.strip():
            res = "[PROMPT IS EMPTY]"
        else:
            res = request_gpt_once(messages, temperature=temperature, answer_max_token=answer_max_token)
        prompt_2nd = dic["prompt_2nd"]
        messages.append({"role": "assistant", "content": res})
        messages.append({"role": "user", "content": prompt_2nd})
        if not prompt_2nd.strip():
            res_2nd = "[2nd PROMPT IS EMPTY]"
        else:
            res_2nd = request_gpt_once(messages, temperature=temperature_2nd, answer_max_token=answer_max_token_2nd)
        dic["gpt_res"] = res
        dic["gpt_res_2nd"] = res_2nd
        out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")
    out_file_f.close()
    e_time = time.perf_counter()
    exec_time = round(e_time - s_time, 3)
    print(f"程序执行时间: {exec_time}s")  # serial loop, not multi-process
import sys
import time
import json
import requests
import datetime
from multiprocessing import Pool
from tqdm import tqdm
def get_access_token():
    api_key = 'xxx'
    secret_key = 'yyy'
    url = f"https://aip.bxxxxbce.com/oauth/2.0/token?grant_type=client_credentials&client_id={api_key}&client_secret={secret_key}"
    payload = json.dumps("")
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json().get("access_token")
TEMPERATURE = 0.01
TOP_P = 0.85
WENXIN_MODEL_MAP = {
    "eb3.5": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions",
    "eb3.5_api2": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions_dev",
    "eb4.0": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions_pro",
    "eb4_api3": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/ernie-4.0-ecom-8k",
    "api4": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/ernie-4.0-8k-preview"
}
WENXIN_TOKEN = "?access_token=" + get_access_token()
def request_wenxin(data, model_type, sleep_time=10, max_retry=6):
    request_api = WENXIN_MODEL_MAP[model_type] + WENXIN_TOKEN
    headers = {
        'Content-Type': 'application/json'
    }
    retry = 0
    while True:
        if retry > max_retry:
            print(f"retry nums > {max_retry}, continue to next one ChatCompletion!")
            return "[CALL WENXIN `%s` API ERROR]" % model_type
        try:
            response = requests.request("POST", request_api, headers=headers, data=data)
            json_data = response.json()
            return json_data["result"]
        except Exception as e:
            now = datetime.datetime.now()
            formatted_now = now.strftime("%Y/%m/%d %H:%M:%S")
            print("[%s]-[Exception][%s]" % (formatted_now, e))
            print("[%s]-[RETRY][call wenxin api, sleeping %ss ...]" % (formatted_now, sleep_time))
            time.sleep(sleep_time)
            retry += 1
def call_wenxin_once(prompt, system, model_type="eb3.5", temperature=0.01, top_p=0.85, answer_max_token=1024, use_default=False):
    if not prompt.strip():
        return "[prompt IS EMPTY]"
    messages = []
    messages.append({"role": "user", "content": prompt})
    if use_default:
        data = json.dumps(
            {"messages": messages, "system": system if system else ""}
        )
    else:
        data = json.dumps(
            {"messages": messages, "system": system if system else "", "temperature": temperature}
        )
    llm_answer = request_wenxin(data, model_type)
    return llm_answer
def call_wenxin_2round(dic, model_type="eb3.5", temperature=0.01, top_p=0.85, answer_max_token=1024):
    system = dic.get("system", "")
    prompt = dic["prompt"]
    answer = dic["wx_res"]
    prompt_2 = dic["prompt_2"]
    if not prompt_2.strip():
        return "[CONTENT IS EMPTY]"
    messages = []
    messages.append({"role": "user", "content": prompt})
    messages.append({"role": "assistant", "content": answer})
    messages.append({"role": "user", "content": prompt_2})
    data = json.dumps(
        {"messages": messages, "system": system if system else "", "temperature": temperature, "top_p": top_p}
    )
    llm_answer = request_wenxin(data, model_type)
    return llm_answer
def call_wenxin_once_testing():
    wenxin_model_type = "api4"
    prompt = "你好,你是谁?"
    system = ""
    res = call_wenxin_once(prompt, system, model_type=wenxin_model_type, temperature=TEMPERATURE, top_p=TOP_P)
    print()
    print("========= LLM API Testing =========")
    print(f"==> Prompt: {prompt}")
    print(f"==> LLM Version: {wenxin_model_type}")
    print(f"==> LLM Response: {res}")
    print("===================================")
    print()
def call_wenxin_once_parallel(dic):
    prompt = dic["prompt"]
    system = ""
    res = call_wenxin_once(prompt, system, temperature=TEMPERATURE, top_p=TOP_P)
    dic["gpt_res"] = res
    return dic
def call_wenxin_once_single():
    data_list = []
    input_file = "data/wd_sft_qa_0613.jsonl"
    with open(input_file, "r") as f:
        for line in f:
            line = line.strip()
            dic = json.loads(line)
            data_list.append(dic)
    out_file = "res/wenxin/" + "wd_sft_qa_0613.wx_eb4_api3_tempNo.jsonl"
    out_file_f = open(out_file, "w")
    s_time = time.perf_counter()
    wenxin_model_type = "eb4.0"
    for idx, dic in tqdm(enumerate(data_list[:])):
        prompt = dic["prompt"]
        system = dic.get("system", "")
        prev_res = dic.get("wx_res", "")
        dic["wenxin_model_type"] = wenxin_model_type
        if not prompt or (prev_res and "API ERROR" not in prev_res):
            out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")
            continue
        res = call_wenxin_once(prompt, system, model_type=wenxin_model_type, temperature=0.01)
        dic["wx_res"] = res
        out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")
    out_file_f.close()
    e_time = time.perf_counter()
    exec_time = round(e_time - s_time, 3)
    print(f"进程执行时间: {exec_time}s")

call_wenxin_once_single()
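The Pool import above is never actually used in these notes; a minimal sketch (my own, with hypothetical inputs and worker count) of driving call_wenxin_once_parallel with a process pool could look like:
if __name__ == "__main__":
    data_list = [{"prompt": "你好"}, {"prompt": "你是谁?"}]   # hypothetical inputs
    with Pool(processes=4) as pool:
        # imap preserves input order; tqdm shows progress as results arrive
        results = list(tqdm(pool.imap(call_wenxin_once_parallel, data_list), total=len(data_list)))
    for dic in results:
        print(dic["gpt_res"])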
Pandas
Reproduce SQL's row_number() in pandas (rank within groups, increasing order); a sketch follows below.
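A minimal sketch, assuming a DataFrame with hypothetical columns group_col and value_col:
import pandas as pd

df = pd.DataFrame({'group_col': ['a', 'a', 'b', 'b'], 'value_col': [3, 1, 2, 4]})
# equivalent to: row_number() over (partition by group_col order by value_col)
df = df.sort_values(['group_col', 'value_col'])
df['row_number'] = df.groupby('group_col').cumcount() + 1
print(df)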
Jupyter Notebook
List the notebooks that are currently running
jupyter notebook list
This lists every running notebook server and also shows its token.
Register another Python environment as a kernel
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip install ipykernel
python -m ipykernel install --name py39
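Optionally, --user installs the kernel for the current user only and --display-name controls how it appears in the notebook UI (py39 is just the example name used above):
python -m ipykernel install --user --name py39 --display-name "Python 3.9"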
pymysql
def get_mysql_db():
    """
    Get a MySQL database connection (host/port are resolved via BNS).
    """
    retry = 0
    while retry < 30:
        try:
            bns_instance = bns.BNS(MYSQL_BNS, auto_start=False).get_instance()
            break
        except Exception:
            time.sleep(0.5)
            retry += 1
    ip = bns_instance['ip']
    port = int(bns_instance['port'])
    db = pymysql.connect(
        host=ip,
        port=port,
        user='MYSQL_USER',
        password='MYSQL_PASSWORD',
        database='MYSQL_DATABASE',
    )
    text = "[INFO][MySQL Database: `%s`]" % db
    print_time(text)
    return db
SQL_OP_SELECT = "SELECT id, spu_id, platform_id FROM %s WHERE status=0"
SQL_OP_SELECT_GOODS_INFO = "SELECT goods_id, data FROM %s WHERE goods_id=%s"
SQL_OP_UPDATE = "UPDATE %s SET status=1 WHERE id=%s"
SQL_OP_UPDATE_REQUEST_STATUS = "UPDATE %s SET %s=%s,%s=%s WHERE id=%s"
def mysql_db_select(db, sql):
    """
    Read from the database.
    """
    try:
        db.ping()
        cursor = db.cursor()
        cursor.execute(sql)
        db.commit()
        fetch = cursor.fetchall()
        status = 1
    except Exception as e:
        db.rollback()
        fetch = ()
        status = 0
    return fetch, status
def mysql_db_update(db, sql):
    """
    Update the database.
    """
    try:
        db.ping()
        cursor = db.cursor()
        cursor.execute(sql)
        db.commit()
        status = 1
    except Exception as e:
        db.rollback()
        status = 0
    return status
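A minimal usage sketch of the SQL templates and helpers above (the table name goods_table is hypothetical):
db = get_mysql_db()
rows, ok = mysql_db_select(db, SQL_OP_SELECT % "goods_table")
if ok:
    for row_id, spu_id, platform_id in rows:
        # mark each selected row as processed
        mysql_db_update(db, SQL_OP_UPDATE % ("goods_table", row_id))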
Redis
def get_redis_client():
    """
    Get a Redis client (host/port are resolved via BNS).
    """
    retry = 0
    while retry < 30:
        try:
            bns_instance = bns.BNS(REDIS_BNS, auto_start=False).get_instance()
            break
        except Exception:
            time.sleep(0.5)
            retry += 1
    ip = bns_instance['ip']
    port = int(bns_instance['port'])
    redis_client = redis.Redis(host=ip, port=port, db=REDIS_DATABASE_INDEX)
    text = "[INFO][Redis Client: `%s`]" % redis_client
    print_time(text)
    return redis_client
def write_to_redis(redis_client, redis_key, data="{1: 2}"):
    """
    Write data to Redis: strings are stored as-is, everything else is JSON-serialized.
    """
    if isinstance(data, str):
        redis_client.set(redis_key, data)
    else:
        redis_client.set(redis_key, json.dumps(data, ensure_ascii=False))
    redis_client.expire(redis_key, REDIS_KEY_EXPIRE_TIME)
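For completeness, a sketch of reading a JSON value back (the key name is hypothetical, and assumes the value was stored as JSON):
raw = redis_client.get("some_key")
if raw is not None:
    value = json.loads(raw)   # json.loads accepts the bytes returned by redis-py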
llama factory
lsof /dev/nvidia* | awk '{print $2}' | tail -n +2 | sort | uniq | xargs -I {} kill -9 {}
stage = "pt"
finetuning_type = "full"
model_name_or_path = "/root/paddlejob/Baichuan2-7B-Chat/"
template = "baichuan2"
dataset_dir = "data/pretrain/0114"
dataset = "pt_data_0111.ml_5000"
dataset_save_path = "${dataset_dir} /processed"
save_model_name = "douyin.ml_1600.bc2_7b"
output_dir = "./ckpt/pretrain/${save_model_name} "
if [ ! -d "${output_dir} " ] ; then
mkdir -p ${output_dir}
fi
num_train_epochs = 2
per_device_train_batch_size = 4
gradient_accumulation_steps = 2
learning_rate = 3e-5
lr_scheduler_type = "cosine"
lora_target = "q_proj,v_proj"
logging_steps = 10
save_steps = 300
deepspeed_config_name_or_path = "./deepspeed_config/deepspeed_config_pretrain.json"
num_gpus = 8
master_port = 9903
deepspeed --num_gpus ${num_gpus} --master_port = ${master_port} src/train_bash.py \
--stage ${stage} \
--model_name_or_path ${model_name_or_path} \
--template ${template} \
--do_train \
--dataset_dir ${dataset_dir} \
--dataset ${dataset} \
--cache_path ${dataset_save_path} \
--finetuning_type ${finetuning_type} \
--lora_target ${lora_target} \
--output_dir ${output_dir} \
--overwrite_cache \
--per_device_train_batch_size ${per_device_train_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--lr_scheduler_type ${lr_scheduler_type} \
--logging_steps ${logging_steps} \
--save_steps ${save_steps} \
--learning_rate ${learning_rate} \
--num_train_epochs ${num_train_epochs} \
--plot_loss \
--fp16 \
--deepspeed ${deepspeed_config_name_or_path} \
--flash_attn ${flash_attn}