Python2/3 使用小记

Python2

编码问题

  1. 字符串编码对齐:Unicode字符串之间可以拼接;非Unicode字符串之间可以拼接
    • 字符串前添加u,转换为Unicode字符串
  2. Unicode字符串编码为utf-8,显示中文:encode_str = unicode_str.encode('utf-8')
    unicode_str = u'\u4e2d\u56fd'
    print type(unicode_str) # <type 'unicode'>
    print unicode_str # 中国
    
    encode_str = unicode_str.encode('utf-8')
    print type(encode_str) # <type 'str'>
    print encode_str # 中国
    
  3. 注意:py2的print函数可直接打印Unicode字符串为中文!因此在保存时,需要进行编码encode

Python3

小记

  1. 字符串
    • string[s_idx : e_idx]:当 s_idx 为负、e_idx 为正时,若 s_idx 折算后的起始下标不小于 e_idx,则得到空子串(并非总是为空,例如 'abcd'[-3:2] == 'b')

断点

import pdb; pdb.set_trace()

随机

  • 列表元素随机分为两部分
import random

random.seed(0)
 
def random_split_data(data_list, ratio):
    """Randomly split ``data_list`` into two parts.

    ``ratio`` is the fraction that goes into the second returned list.
    Returns ``([], data_list)`` when the list is empty or the split part
    would be empty. NOTE: ``data_list`` is shuffled in place as a side effect.
    """

    total = len(data_list)
    offset = int(total * ratio)

    # Degenerate cases: nothing to split off.
    if total == 0 or offset < 1:
        return [], data_list

    random.shuffle(data_list)

    # First part (e.g. train) is everything past the offset;
    # second part (e.g. valid) is the first `offset` items.
    return data_list[offset:], data_list[:offset]

# Fix: the snippet above defines `random_split_data(data_list, ratio)`; the
# original call `data_split(..., shuffle=True)` would raise NameError (and no
# `shuffle` parameter exists — shuffling happens inside the function).
url_train, url_valid = random_split_data(list(url_set), ratio=0.1)
  • 从列表中随机取出 k 个元素
a = list(range(10))

# Sample WITHOUT replacement: 3 distinct elements.
b = random.sample(a, k=3)

# Sample WITH replacement: elements may repeat.
c = random.choices(a, k=3)

时间

# Format the current local time as 'YYYY-MM-DD HH:MM:SS'.
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

def print_time(text):
    """Print *text* prefixed with the current timestamp: [YYYY/MM/DD HH:MM:SS]-text."""

    stamp = datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    print("[%s]-%s" % (stamp, text))

跳出双层循环

  • for else语法:必须在for循环里面添加break,否则会正常执行完所有流程,导致意外的结果
s = [11, 2, 3]

# for/else: the `else` branch runs only when the loop finishes WITHOUT `break`.
# There is no break here, so "s" is always printed after the loop —
# "1" is printed for each element < 5 (here: 2 and 3), then "s".
for i in s:
    if i < 5:
        print(1)
else:
    print("s") 

JSON保存文件、JSON保存中文

Conda

异常处理

# Fix: the original snippet mixed tabs and spaces for indentation, which raises
# TabError/IndentationError under Python 3. Indentation normalized to 4 spaces.
while True:
    try:
        # Code that may raise.
        num1 = int(input('请输入一个数字:'))
        num2 = int(input('请输入一个数字:'))
        division = num1 / num2
    except (ZeroDivisionError, ValueError) as e:
        # Dispatch on the concrete exception type.
        if isinstance(e, ZeroDivisionError):
            print('程序出现了除以零错误')
        elif isinstance(e, ValueError):
            print('程序输入类型错误')
    else:
        # Runs only when no exception occurred.
        print('两个数字相除等于{}'.format(num1 / num2))
        break  # inside a function, `return` could be used instead

打开文件与写入文件

  • 一个示例
import sys
import json
import math
import re
from tqdm import tqdm

j_ls = []

prefix_path = '/mnt/data_hub/raw_data/'
open_file = prefix_path + sys.argv[1]
save_file = prefix_path + 'processed_data/' + sys.argv[2]

# Fix: the original `open(open_file, 'r')` inside the for-statement leaked the
# file handle; a context manager closes it deterministically.
with open(open_file, 'r') as in_f:
    for line in tqdm(in_f):
        j_ls.append(json.loads(line))

def split_text(text):
    text = text.strip('。.!!??')
    text = text.replace('...', '')
    text = text.replace(' ', '')

    pattern = '(。|!|\!|\.|?|\?)'
    text_splits = re.split(pattern, text)
    text_splits = [t for t in text_splits if t]

    text_splits_droplast = text_splits[:-1]
    sentences = text_splits_droplast[::2]
    punctuations = text_splits_droplast[1::2]

    if not text_splits or not sentences or not punctuations:
        return []

    ls = []
    for i, j in zip(sentences, punctuations):
        ls.append(i + j)
    if text_splits[-1] != '':
        ls.append(text_splits[-1] + '。')
    
    return ls
    
def process_text(j):
    """Build one continuation-writing sample from record *j*.

    The leading ~70% of the sentences of j['text'][0] become the instruction
    (prefixed with a fixed Chinese prompt); the trailing ~30% (at least the
    last sentence) become the expected response.

    Returns a one-element list of sample dicts, or [] when the text yields no
    sentences (robustness fix: the original indexed text_ls[0] unguarded and
    relied entirely on the caller pre-filtering).
    """

    text_ls = split_text(j['text'][0])
    text_cnt = len(text_ls)
    if text_cnt == 0:
        return []

    prompt = '根据上下文情景,尽可能真实地、详细地补全下面一段文本的后续部分。\n文本:'

    # Reserve ceil(30%) of the sentences for the target continuation.
    p = 0.3
    split_num = text_cnt - math.ceil(text_cnt * p)

    # The first sentence always goes to the input (this mirrors the original
    # behaviour even when split_num == 0, i.e. text_cnt == 1).
    input_text = prompt + text_ls[0] + ''.join(text_ls[1:split_num])

    # Everything from split_num onwards is the expected continuation.
    output_text = ''.join(text_ls[split_num:])

    return [{
        'instruction': input_text,
        'response': output_text,
        'data_source': j['data_source'],
        'model_arch': j['model_arch'],
        'id': j['id'],
    }]

# Fix: `i` was initialised and printed but never incremented; it now counts
# records whose 'text' list has exactly one element, as the final print implies.
i = 0
ls = []
with open(save_file, 'w') as f:
    for json_data in tqdm(j_ls, desc="formatting.."):
        text_cnt = len(json_data['text'])
        ls.append(text_cnt)
        if text_cnt == 1:
            i += 1
        text = json_data['text'][0]
        # Skip records that are empty after stripping terminal punctuation or
        # that yield no sentences.
        if not text.strip('。.!!??') or split_text(text) == []:
            continue
        for sample in process_text(json_data):
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

print('num of text_cnt == 1 is ', i)
# Guard: min()/max() raise ValueError on an empty sequence.
if ls:
    print('min/max len of ls', min(ls), max(ls))

环境、程序包

  • pip换源:pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
  • 查看程序包版本:pip index versions xxx_package

chatgpt

### 微软 GPT4 服务
import sys
import time
import json
import datetime

from multiprocessing import Pool

from tqdm import tqdm

import time
import json
import datetime
import requests


# GPT proxy endpoint configuration.
# NOTE(review): the tokens/keys below are credentials committed to source —
# they should be loaded from environment variables or a secrets store, and the
# leaked values rotated.
GPT_API = "https://aishopping.baidu-int.com/api/chatCompletion"
GPT_API_HEADERS = {'Content-Type': 'application/json'}
BS_NAME = 'dhuman'
# GPT_API_TOKEN = 'b29c2955-9f69-439d-b8db-320696d7c19a'
# SERVICE_ID = 14

GPT_API_TOKEN = '560d56ee-5e12-4f4a-8510-bf96f0bbe2c5'
SERVICE_ID = 15

# # openai configuration (no longer valid)
# import openai
# # openai.api_type = "azure"
# # openai.api_base = "https://azure-gpt-01.openai.azure.com/"
# # openai.api_version = "2023-05-15"
# # openai.api_key = "344144ae1e264137898b67fead8c3e25"

import openai
# engine="azure-gpt-07"
openai.api_type = "azure"
# openai.api_base = "https://azure-gpt-04.openai.azure.com/"
# openai.api_version = "2023-05-15"
# openai.api_key = "c23bd9e110fe4936bb9f2f2e6d635010"

# Active Azure OpenAI endpoint settings.
openai.api_version = "2023-05-15"
openai.api_base = "https://azure-gpt-05.openai.azure.com/"
openai.api_key = "10caa809b8a4472ea0ae42bf89c8be8b"


def run_gpt(messages, temperature, answer_max_token, use_default=False):
    """Call the internal GPT proxy API and return the answer text.

    When ``use_default`` is True the service's default temperature is used
    (no "temperature" field is sent).
    """

    azure_param = {
        "messages": messages,
        "max_tokens": answer_max_token,
    }
    if not use_default:
        azure_param["temperature"] = temperature

    payload = json.dumps({
        "token": GPT_API_TOKEN,
        "serviceId": SERVICE_ID,
        "bussinessName": BS_NAME,  # (sic) key spelling is what the API expects
        "azureParam": azure_param,
    })

    resp = requests.request("POST", GPT_API, headers=GPT_API_HEADERS, data=payload)
    return resp.json()["data"]["answer"]


def run_gpt_old(messages, temperature, answer_max_token, use_default=False):
    """Call Azure OpenAI ChatCompletion directly and return the answer text.

    When ``use_default`` is True, no temperature is passed (service default).
    """

    create_kwargs = {
        "engine": "azure-gpt-08",
        "max_tokens": answer_max_token,  # max token size of the answer
        "messages": messages,
    }
    if not use_default:
        create_kwargs["temperature"] = temperature

    completion = openai.ChatCompletion.create(**create_kwargs)
    return completion.choices[0]["message"]["content"]


def request_gpt_once(messages, temperature=0.01, answer_max_token=1024, sleep_time=8, max_retry=15, use_default=False):
    """Request GPT with retries.

    Returns the answer string, or "[REQUEST GPT ERROR]" after
    ``max_retry + 1`` failed attempts (sleeping ``sleep_time``s between tries).
    """

    for _ in range(max_retry + 1):
        try:
            # answer = run_gpt(messages, temperature, answer_max_token, use_default=False)
            return run_gpt_old(messages, temperature, answer_max_token, use_default=use_default)
        except Exception as exc:
            stamp = datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")
            print("[%s]-[Exception][%s]" % (stamp, exc))
            print("[%s]-[RETRY][ChatCompletion.create, sleeping %ss ...]" % (stamp, sleep_time))
            time.sleep(sleep_time)

    return "[REQUEST GPT ERROR]"


def gpt4_turbo_once_testing():
    """Smoke-test the GPT API with a fixed prompt and print the response."""

    prompt = "你好,你是谁?"
    system = ""

    messages = ([{"role": "system", "content": system}] if system else [])
    messages.append({"role": "user", "content": prompt})

    res = request_gpt_once(messages, answer_max_token=4096, use_default=True)

    print("\n" * 1)
    print("========= GPT API Testing =========")
    print(f"==> messages: {messages}")
    print(f"==> GPT Response: {res}")
    print("===================================")
    print("\n" * 1)


def gpt4_turbo_once_single_v2():
    """Second smoke test: fixed prompt "你是谁?", default temperature."""

    prompt = "你是谁?"
    system = ""

    messages = ([{"role": "system", "content": system}] if system else [])
    messages.append({"role": "user", "content": prompt})

    res = request_gpt_once(messages, answer_max_token=4096, use_default=True)

    for line_out in (
        "\n",
        "========= GPT API Testing =========",
        f"==> messages: {messages}",
        f"==> GPT Response: {res}",
        "===================================",
        "\n",
    ):
        print(line_out)


def gpt4_turbo_once_parallel(dic):
    """Pool.map worker: query GPT once for dic["prompt"], store it in dic["gpt_res"].

    Fix: the original called the undefined `gpt4_turbo_once(...)` (NameError at
    runtime); it now builds the messages list and calls request_gpt_once.
    """

    prompt = dic["prompt"]
    # system = dic["system"]
    system = "system"  # NOTE(review): literal placeholder kept from the original — confirm intent

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    dic["gpt_res"] = request_gpt_once(messages, temperature=0.01, answer_max_token=2048)
    return dic


### 0. 测试 GPT API 
##########################################################################################

# gpt4_turbo_once_testing()

##########################################################################################



### 1. 单进程调用
##########################################################################################

def call_gpt4_once_single():
    """Single-process batch call: read prompts from a JSONL file, query GPT once
    per record, and stream results to an output JSONL file.

    Records that already contain a non-error `gpt_res` are passed through
    unchanged, so a run can resume from a partially processed output file.
    """

    answer_max_token = 4096

    # Input
    data_list = []
    input_file = "data/zl_0511_30.jsonl"
    with open(input_file, "r") as f:
        for line in f:
            data_list.append(json.loads(line.strip("\n")))

    # Output — fix: context manager instead of a bare open()/close() pair, so
    # the handle is closed (and buffers flushed) even if a request raises.
    out_file = "res/gpt/" + "zl_0511_30.gpt_tempNo.jsonl"

    s_time = time.perf_counter()

    with open(out_file, "w") as out_file_f:
        for dic in tqdm(data_list):

            prompt = dic["prompt"]
            system = dic.get("system", "")
            prev_res = dic.get("gpt_res", "")

            # Skip records that already have a successful result (resume support).
            if prev_res and "ERROR" not in prev_res:
                out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")
                continue

            messages = []
            if system:
                messages.append({"role": "system", "content": system})
            messages.append({"role": "user", "content": prompt})

            if not prompt.strip():
                res = "[PROMPT IS EMPTY]"
            else:
                res = request_gpt_once(messages, answer_max_token=answer_max_token, use_default=True)

            dic["gpt_res"] = res
            out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")

    e_time = time.perf_counter()
    exec_time = round(e_time - s_time, 3)
    print(f"程序执行时间: {exec_time}s")

##########################################################################################



### 1. 单进程调用(2轮)
##########################################################################################

def call_gpt4_2_rounds():
    """Two-round single-process batch call.

    Round 1 answers dic["prompt"]; round 2 continues the same conversation
    with dic["prompt_2nd"]. Both answers are written to an output JSONL file.
    """

    # Round-1 parameters
    temperature = 0.8
    answer_max_token = 4096

    # Round-2 parameters
    temperature_2nd = 0.5
    answer_max_token_2nd = 4096

    # Input
    data_list = []
    input_file = "data/train/gen_script_1124.prompt.jsonl"
    with open(input_file, "r") as f:
        for line in f:
            data_list.append(json.loads(line.strip("\n")))

    # Output file name encodes both temperatures, e.g. 0.8 -> "08".
    temperature_pt = str(temperature).replace(".", "")
    temperature_2nd_pt = str(temperature_2nd).replace(".", "")
    out_file = "res/gpt/train/" + input_file.split("/")[-1].replace(".jsonl", "") + ".reflection.gpt_temp%s_temp2nd%s.jsonl" % (temperature_pt, temperature_2nd_pt)

    s_time = time.perf_counter()

    # Fix: context manager instead of a bare open()/close() pair, so the
    # handle is closed even if a request raises.
    with open(out_file, "w") as out_file_f:
        for dic in tqdm(data_list):

            # Round 1
            prompt = dic["prompt"]
            system = dic["system"]

            messages = []
            if system:
                messages.append({"role": "system", "content": system})
            messages.append({"role": "user", "content": prompt})

            if not prompt.strip():
                res = "[PROMPT IS EMPTY]"
            else:
                res = request_gpt_once(messages, temperature=temperature, answer_max_token=answer_max_token)

            # Round 2: append the round-1 answer, then the follow-up prompt.
            prompt_2nd = dic["prompt_2nd"]
            messages.append({"role": "assistant", "content": res})
            messages.append({"role": "user", "content": prompt_2nd})

            if not prompt_2nd.strip():
                res_2nd = "[2nd PROMPT IS EMPTY]"
            else:
                res_2nd = request_gpt_once(messages, temperature=temperature_2nd, answer_max_token=answer_max_token_2nd)

            dic["gpt_res"] = res
            dic["gpt_res_2nd"] = res_2nd
            out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")

    e_time = time.perf_counter()
    exec_time = round(e_time - s_time, 3)
    # Fix: original log said "多进程执行时间" (multi-process) — this is single-process.
    print(f"程序执行时间: {exec_time}s")

##########################################################################################



### 2. 多进程调用
##########################################################################################

# data_list = []

# input_file = "data/gpt.shaoqi_1103.jsonl"
# with open(input_file, "r") as f:
#     for line in f:
#         line = line.strip("\n")
#         dic = json.loads(line)
#         data_list.append(dic)


# s_time = time.perf_counter()

# POOL_SIZE = 10
# with Pool(POOL_SIZE) as pool:
#     result_list = pool.map(gpt4_turbo_once_parallel, data_list)
# pool.close()
# pool.join()

# e_time = time.perf_counter()
# exec_time = round(e_time - s_time, 3)
# print(f"多进程执行时间: {exec_time}s")


# ### 程序执行完成后保存结果
# # output_file_final = input_file.replace(".jsonl", "") + ".gpt_res.jsonl"
# output_file_final = input_file.replace(".jsonl", "") + ".gpt_res_with_system.jsonl"
# with open(output_file_final, "w") as f:
#     for dic in result_list:
#         f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")

##########################################################################################



### 3. 单进程调用 + 反思
##########################################################################################

# SYSTEM_REF = """你是一位擅长编写电商直播剧本的专业编剧。给你一篇由大模型生成的电商直播剧本,结合下面的Checklist(以```包裹)逐条分析,对该剧本进行修订,返回修订后的剧本。

# ```
# 1. 检查剧本的返回格式
#     - 剧本的返回格式是否满足:只有正文格式的内容。若不满足,则去掉开头结尾的任何标题、符号、解释说明,只获取其正文部分,用作后续分析与返回。
    
# 2. 检查剧本的内容
#     - 剧本中是否存在:常见的连接词,比如:那么、现在、而且、好了、首先、其次、最后等。若存在,则对其进行润色或删除。
#     - 剧本中是否存在:不适合口播的文字。若存在,则对其进行调整或删除。
#     - 剧本中是否存在:特殊符号和规格参数。若存在,则对其进行调整或删除。
#     - 剧本字数是否满足:1000字左右。若字数严重不足,则对其进行续写。
# ```"""


# PROMPT_REF = """大模型生成的电商直播剧本如下(以```包裹):

# ```
# %s
# ```

# 返回结果要求:使用正文格式直接返回修订后的剧本内容,不要添加任何标题,也不要进行任何解释说明。"""


# def self_reflection():

#     # 参数
#     temperature = 0.8
#     answer_max_token = 4096

#     temperature_ref = 0.8
#     answer_max_token_ref = 4096

#     # 输入
#     data_list = []

#     input_file = "data/train/gen_script_1123.prompt.jsonl"
#     with open(input_file, "r") as f:
#         for line in f:
#             line = line.strip("\n")
#             dic = json.loads(line)
#             data_list.append(dic)

#     # 输出
#     temperature_print = str(temperature).replace(".", "")
#     out_file = "res/gpt/train/" + input_file.split("/")[-1].replace(".jsonl", "") + ".reflection.gpt_temp%s.jsonl" % temperature_print
#     # out_file = "res/gpt/train/" + input_file.split("/")[-1].replace(".jsonl", "") + ".reflection.gpt_tempNo.jsonl"
#     out_file_f = open(out_file, "w")

#     s_time = time.perf_counter()

#     for dic in tqdm(data_list[:]):
#         prompt = dic["prompt"]
#         system = dic["system"]

#         res = request_gpt_once(prompt, system, temperature=temperature, answer_max_token=answer_max_token)
#         dic["gpt_res"] = res

#         # 自我反思
#         prompt_ref = PROMPT_REF % res
#         system_ref = SYSTEM_REF
#         res_ref = request_gpt_once(prompt_ref, system_ref, temperature=temperature_ref, answer_max_token=answer_max_token_ref)
#         dic["prompt_ref"] = prompt_ref
#         dic["system_ref"] = system_ref
#         dic["gpt_res_ref"] = res_ref

#         out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")

#     out_file_f.close()

#     e_time = time.perf_counter()
#     exec_time = round(e_time - s_time, 3)
#     print(f"多进程执行时间: {exec_time}s")


# self_reflection()

##########################################################################################
  • wx
import sys
import time
import json
import requests
import datetime

from multiprocessing import Pool

from tqdm import tqdm



def get_access_token():
    """Fetch an OAuth access token from the BCE token endpoint.

    NOTE(review): api_key / secret_key are placeholders committed in source —
    load real credentials from the environment.
    """

    api_key = 'xxx'
    secret_key = 'yyy'

    url = (
        "https://aip.bxxxxbce.com/oauth/2.0/token"
        f"?grant_type=client_credentials&client_id={api_key}&client_secret={secret_key}"
    )

    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    resp = requests.request("POST", url, headers=headers, data=json.dumps(""))
    return resp.json().get("access_token")


# Request settings.
TEMPERATURE = 0.01
TOP_P = 0.85
# QUERY_MAX_TOKENS = 2048

# Map of model aliases to their chat-completions endpoints.
WENXIN_MODEL_MAP = {
    "eb3.5": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions",
    "eb3.5_api2": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions_dev",
    # "eb3.5_api2_iter": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions_dev_nightly",
    "eb4.0": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions_pro",
    "eb4_api3": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/ernie-4.0-ecom-8k",
    "api4": "https://aip.bxxxxbce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/ernie-4.0-8k-preview"
}

# NOTE(review): this performs a network call at import time; consider fetching
# the token lazily so importing the module cannot fail or block on the network.
WENXIN_TOKEN = "?access_token=" + get_access_token()


def request_wenxin(data, model_type, sleep_time=10, max_retry=6):
    """POST *data* to the Wenxin endpoint for *model_type*, retrying on failure.

    Returns the "result" field of the JSON response, or an error marker after
    ``max_retry + 1`` failed attempts.
    """

    request_api = WENXIN_MODEL_MAP[model_type] + WENXIN_TOKEN
    headers = {
        'Content-Type': 'application/json'
    }

    for _ in range(max_retry + 1):
        try:
            resp = requests.request("POST", request_api, headers=headers, data=data)
            json_data = resp.json()
            # print(json_data)
            return json_data["result"]
        except Exception as exc:
            stamp = datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")
            print("[%s]-[Exception][%s]" % (stamp, exc))
            print("[%s]-[RETRY][call wenxin api, sleeping %ss ...]" % (stamp, sleep_time))
            time.sleep(sleep_time)

    print(f"retry nums > {max_retry}, continue to next one ChatCompletion!")
    return "[CALL WENXIN `%s` API ERROR]" % model_type


def call_wenxin_once(prompt, system, model_type="eb3.5", temperature=0.01, top_p=0.85, answer_max_token=1024, use_default=False):
    """Single-turn Wenxin call; returns the answer text.

    Returns "[prompt IS EMPTY]" without calling the API when the prompt is blank.
    When ``use_default`` is True no temperature is sent (service default).
    """

    if not prompt.strip():
        return "[prompt IS EMPTY]"

    messages = [{"role": "user", "content": prompt}]

    body = {"messages": messages, "system": system if system else ""}
    if not use_default:
        body["temperature"] = temperature
    data = json.dumps(body)

    # data = json.dumps(
    #     {"messages": messages, "system": system if system else "", "temperature": temperature, "top_p": top_p}
    # )

    return request_wenxin(data, model_type)


def call_wenxin_2round(dic, model_type="eb3.5", temperature=0.01, top_p=0.85, answer_max_token=1024):
    """Second-round Wenxin call: replay (prompt, previous answer), then ask prompt_2.

    Returns "[CONTENT IS EMPTY]" without calling the API when dic["prompt_2"]
    is blank.
    """

    system = dic.get("system", "")
    prompt = dic["prompt"]
    answer = dic["wx_res"]
    prompt_2 = dic["prompt_2"]

    if not prompt_2.strip():
        return "[CONTENT IS EMPTY]"

    messages = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": answer},
        {"role": "user", "content": prompt_2},
    ]

    data = json.dumps({
        "messages": messages,
        "system": system if system else "",
        "temperature": temperature,
        "top_p": top_p,
    })

    return request_wenxin(data, model_type)


def call_wenxin_once_testing():
    """Smoke-test the Wenxin API with a fixed prompt and print the response."""

    wenxin_model_type = "api4"
    prompt = "你好,你是谁?"
    system = ""

    res = call_wenxin_once(prompt, system, model_type=wenxin_model_type, temperature=TEMPERATURE, top_p=TOP_P)

    for line_out in (
        "",
        "========= LLM API Testing =========",
        f"==> Prompt: {prompt}",
        f"==> LLM Version: {wenxin_model_type}",
        f"==> LLM Response: {res}",
        "===================================",
        "",
    ):
        print(line_out)


def call_wenxin_once_parallel(dic):
    """Pool.map worker: query Wenxin once for dic["prompt"], store it in dic."""

    # system = dic["system"]
    dic["gpt_res"] = call_wenxin_once(dic["prompt"], "", temperature=TEMPERATURE, top_p=TOP_P)
    return dic


### 0. 测试 LLM API 
##########################################################################################

# call_wenxin_once_testing()

##########################################################################################



### 1. 单进程调用(单轮)
##########################################################################################

def call_wenxin_once_single():
    """Single-process batch call against Wenxin.

    Reads prompts from a JSONL file, queries once per record, and streams
    results to an output JSONL file. Records with an empty prompt or an
    existing non-error `wx_res` are passed through unchanged (resume support).
    """

    # Input
    data_list = []
    input_file = "data/wd_sft_qa_0613.jsonl"
    with open(input_file, "r") as f:
        for line in f:
            data_list.append(json.loads(line.strip()))

    # Output — fix: context manager instead of a bare open()/close() pair, so
    # the handle is closed even if a request raises.
    out_file = "res/wenxin/" + "wd_sft_qa_0613.wx_eb4_api3_tempNo.jsonl"

    s_time = time.perf_counter()

    wenxin_model_type = "eb4.0"

    with open(out_file, "w") as out_file_f:
        for dic in tqdm(data_list):
            prompt = dic["prompt"]
            system = dic.get("system", "")
            prev_res = dic.get("wx_res", "")

            dic["wenxin_model_type"] = wenxin_model_type

            # Skip empty prompts and records that already succeeded.
            if not prompt or (prev_res and "API ERROR" not in prev_res):
                out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")
                continue

            res = call_wenxin_once(prompt, system, model_type=wenxin_model_type, temperature=0.01)
            dic["wx_res"] = res

            out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")

    e_time = time.perf_counter()
    exec_time = round(e_time - s_time, 3)
    print(f"进程执行时间: {exec_time}s")

##########################################################################################



### 1. 单进程调用(多轮)
##########################################################################################

# def call_wenxin_multi():
#     # 输入
#     data_list = []

#     # input_file = "data/yuxuan_1114.sheet2.round1.wx_res.jsonl"
#     input_file = "res/wenxin/yuxuan_1114.sheet2.round1.wx_res.eb35_api2.jsonl"
#     with open(input_file, "r") as f:
#         for line in f:
#             line = line.strip("\n")
#             dic = json.loads(line)
#             dic["prompt_2"] = "将回复内容扩充至120字,增加以答案内容为主题的讲述,并且最后引导下单购买\n输出格式:\n'''json\n{\ntype:无参考信息/有参考信息,\ncontent:在有参考信息时按照参考信息输出答案,无参考信息时需要遵循人设并自助进行回答\n}\n'''"
#             data_list.append(dic)

#     # 输出
#     # out_file = "res/wenxin/" + input_file.split("/")[-1].replace(".jsonl", "") + ".wx_res.jsonl"
#     out_file = "res/wenxin/" + "yuxuan_1114.sheet2.round2.wx_res.eb35_api2.jsonl"
#     out_file_f = open(out_file, "w")

#     s_time = time.perf_counter()

#     wenxin_model_type = "eb3.5_api2"
#     # wenxin_model_type = "eb3.5_api2_iter"

#     for dic in tqdm(data_list):

#         res = call_wenxin_2round(dic, model_type=wenxin_model_type, temperature=TEMPERATURE, top_p=TOP_P)
#         dic["answer_2"] = res

#         out_file_f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")

#     out_file_f.close()

#     e_time = time.perf_counter()
#     exec_time = round(e_time - s_time, 3)
#     print(f"多进程执行时间: {exec_time}s")


# call_wenxin_multi()

##########################################################################################



### 2. 多进程调用
##########################################################################################

# data_list = []

# input_file = "data/gpt.shaoqi_1103.jsonl"
# with open(input_file, "r") as f:
#     for line in f:
#         line = line.strip("\n")
#         dic = json.loads(line)
#         data_list.append(dic)


# s_time = time.perf_counter()

# POOL_SIZE = 10
# with Pool(POOL_SIZE) as pool:
#     result_list = pool.map(call_wenxin_once_parallel, data_list)
# pool.close()
# pool.join()

# e_time = time.perf_counter()
# exec_time = round(e_time - s_time, 3)
# print(f"多进程执行时间: {exec_time}s")


# ### 程序执行完成后保存结果
# # output_file_final = input_file.replace(".jsonl", "") + ".gpt_res.jsonl"
# output_file_final = input_file.replace(".jsonl", "") + ".gpt_res_with_system.jsonl"
# with open(output_file_final, "w") as f:
#     for dic in result_list:
#         f.write(f"{json.dumps(dic, ensure_ascii=False)}\n")

##########################################################################################

Pandas

实现 SQL 中的 row_number 排序(分组排序、递增排序)

Jupyter Notebook

查看正在run的每个notebook
  • jupyter notebook list:可以查看正在run的每个notebook,可以查询其token
新增其他的Python环境
# Step 1: activate the target conda environment, e.g.: conda activate chatglm-6b

# Step 2: install ipykernel
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple # switch to the Tsinghua PyPI mirror
pip install ipykernel

# Step 3: register the environment as a Jupyter kernel under a name, e.g.: py39
python -m ipykernel install --name py39

pymysql

  • 获取数据库db实例
def get_mysql_db():
    """Resolve the MySQL host/port via BNS and return a pymysql connection.

    Fixes vs. original: the bare `except:` is narrowed to Exception, and if all
    30 BNS lookups fail a RuntimeError is raised instead of the NameError the
    original hit on the unbound `bns_instance`.
    """

    # Look up the service instance via BNS, with retries.
    bns_instance = None
    for _ in range(30):
        try:
            bns_instance = bns.BNS(MYSQL_BNS, auto_start=False).get_instance()
            break
        except Exception:
            time.sleep(0.5)

    if bns_instance is None:
        raise RuntimeError("BNS lookup failed for %s after 30 retries" % MYSQL_BNS)

    ip = bns_instance['ip']
    port = int(bns_instance['port'])

    # Create the MySQL connection.
    db = pymysql.connect(
        host=ip,
        port=port,
        user='MYSQL_USER',
        password='MYSQL_PASSWORD',
        database='MYSQL_DATABASE',
        # charset='utf8mb4'
    )

    print_time("[INFO][MySQL Database: `%s`]" % db)
    return db
  • 操作db
# Basic SQL statement templates; callers fill them in with %-formatting.
# NOTE(review): %-interpolating values into SQL strings is injection-prone —
# prefer cursor.execute(sql, params) with placeholders for the value parts.
SQL_OP_SELECT = "SELECT id, spu_id, platform_id FROM %s WHERE status=0"
SQL_OP_SELECT_GOODS_INFO = "SELECT goods_id, data FROM %s WHERE goods_id=%s"
SQL_OP_UPDATE = "UPDATE %s SET status=1 WHERE id=%s"
SQL_OP_UPDATE_REQUEST_STATUS = "UPDATE %s SET %s=%s,%s=%s WHERE id=%s"

def mysql_db_select(db, sql):
    """Execute a SELECT and return (rows, status): status 1 on success, 0 on error."""

    try:
        # A long-lived connection may have gone stale; ping() reconnects it.
        db.ping()

        cursor = db.cursor()
        cursor.execute(sql)
        db.commit()

        return cursor.fetchall(), 1

    except Exception:
        db.rollback()
        return (), 0

def mysql_db_update(db, sql):
    """Execute a write statement; return 1 on success, 0 on error (rolled back)."""

    try:
        # Reconnect a possibly-stale connection before writing.
        db.ping()

        db.cursor().execute(sql)
        db.commit()
        return 1

    except Exception:
        db.rollback()
        return 0

Redis

  • 获取redis实例
def get_redis_client():
    """Resolve the Redis host/port via BNS and return a redis.Redis client.

    Fixes vs. original: the bare `except:` is narrowed to Exception, and a
    RuntimeError replaces the NameError that occurred when all 30 BNS lookups
    failed and `bns_instance` stayed unbound.
    """

    # Look up the service instance via BNS, with retries.
    bns_instance = None
    for _ in range(30):
        try:
            bns_instance = bns.BNS(REDIS_BNS, auto_start=False).get_instance()
            break
        except Exception:
            time.sleep(0.5)

    if bns_instance is None:
        raise RuntimeError("BNS lookup failed for %s after 30 retries" % REDIS_BNS)

    ip = bns_instance['ip']
    port = int(bns_instance['port'])

    # Create the Redis client.
    redis_client = redis.Redis(host=ip, port=port, db=REDIS_DATABASE_INDEX)
    # redis_client.select(index=1)  # select() does not exist in old Redis clients

    print_time("[INFO][Redis Client: `%s`]" % redis_client)
    return redis_client
  • 操作Redis
# Write
def write_to_redis(redis_client, redis_key, data="{1: 2}"):
    """Store *data* under *redis_key* (JSON-encoding non-strings) and set its TTL."""

    # Strings go in as-is; anything else is JSON-serialized.
    payload = data if isinstance(data, str) else json.dumps(data, ensure_ascii=False)
    redis_client.set(redis_key, payload)

    # Expire the key after REDIS_KEY_EXPIRE_TIME seconds.
    redis_client.expire(redis_key, REDIS_KEY_EXPIRE_TIME)

llama factory

  • pt_run.sh(预训练)
# Kill any processes currently holding the GPUs.
lsof /dev/nvidia* | awk '{print $2}' | tail -n +2 | sort | uniq | xargs -I {} kill -9 {}

# export NCCL_P2P_LEVEL=NVL
# export NCCL_P2P_DISABLE=1

### Parameter configuration
# Training mode
stage="pt"
finetuning_type="full" # "lora", "freeze", "full"

# Model path
model_name_or_path="/root/paddlejob/Baichuan2-7B-Chat/"
template="baichuan2"

# Data path
dataset_dir="data/pretrain/0114"
dataset="pt_data_0111.ml_5000"
# dataset_dir="data/pretrain/1127"
# dataset="douyin.ml_1600"

# Dataset cache path
dataset_save_path="${dataset_dir}/processed"
# mkdir -p ${dataset_save_path}

# Output path
save_model_name="douyin.ml_1600.bc2_7b"
output_dir="./ckpt/pretrain/${save_model_name}"

if [ ! -d "${output_dir}" ]; then
    mkdir -p ${output_dir}
fi

# Training hyper-parameters
num_train_epochs=2

per_device_train_batch_size=4
gradient_accumulation_steps=2

learning_rate=3e-5
lr_scheduler_type="cosine"

# Only relevant when finetuning_type="lora"
lora_target="q_proj,v_proj"

# Logging parameters
logging_steps=10
save_steps=300

deepspeed_config_name_or_path="./deepspeed_config/deepspeed_config_pretrain.json"
# deepspeed_config_name_or_path="./deepspeed_config/deepspeed_config_pretrain_stage3.json"

# Other parameters
num_gpus=8
master_port=9903


### Multi-GPU (single node, multiple cards)
# NOTE(review): ${flash_attn} is never defined in this script, so the final
# argument expands to a bare `--flash_attn` — define flash_attn above or drop
# the flag; verify against the installed LLaMA-Factory version.
deepspeed --num_gpus ${num_gpus} --master_port=${master_port} src/train_bash.py \
    --stage ${stage} \
    --model_name_or_path ${model_name_or_path} \
    --template ${template} \
    --do_train \
    --dataset_dir ${dataset_dir} \
    --dataset ${dataset} \
    --cache_path ${dataset_save_path} \
    --finetuning_type ${finetuning_type} \
    --lora_target ${lora_target} \
    --output_dir ${output_dir} \
    --overwrite_cache \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --lr_scheduler_type ${lr_scheduler_type} \
    --logging_steps ${logging_steps} \
    --save_steps ${save_steps} \
    --learning_rate ${learning_rate} \
    --num_train_epochs ${num_train_epochs} \
    --plot_loss \
    --fp16 \
    --deepspeed ${deepspeed_config_name_or_path} \
    --flash_attn ${flash_attn}


### Single GPU
# CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
    # --stage pt \
    # --model_name_or_path path_to_llama_model \
    # --do_train \
    # --dataset wiki_demo \
    # --finetuning_type lora \
    # --lora_target q_proj,v_proj \
    # --output_dir path_to_pt_checkpoint \
    # --overwrite_cache \
    # --per_device_train_batch_size 4 \
    # --gradient_accumulation_steps 4 \
    # --lr_scheduler_type cosine \
    # --logging_steps 10 \
    # --save_steps 1000 \
    # --learning_rate 5e-5 \
    # --num_train_epochs 3.0 \
    # --plot_loss \
    # --fp16
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值