时期实体识别


import re
from datetime import datetime,timedelta
from dateutil.parser import parse
import jieba.posseg as psg

UTIL_CN_NUM = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
    '5': 5, '6': 6, '7': 7, '8': 8, '9': 9
}

UTIL_CN_UNIT = {'十': 10, '百': 100, '千': 1000, '万': 10000}

def cn2dig(src):
    if src == "":
        return None
    m = re.match("\d+", src)
    if m:
        return int(m.group(0))
    rsl = 0
    unit = 1
    for item in src[::-1]:
        if item in UTIL_CN_UNIT.keys():
            unit = UTIL_CN_UNIT[item]
        elif item in UTIL_CN_NUM.keys():
            num = UTIL_CN_NUM[item]
            rsl += num * unit
        else:
            return None
    if rsl < unit:
        rsl += unit
    return rsl

def year2dig(year):
    res = ''
    for item in year:
        if item in UTIL_CN_NUM.keys():
            res = res + str(UTIL_CN_NUM[item])
        else:
            res = res + item
    m = re.match("\d+", res)
    if m:
        if len(m.group(0)) == 2:
            return int(datetime.datetime.today().year/100)*100 + int(m.group(0))
        else:
            return int(m.group(0))
    else:
        return None

def parse_datetime(msg):
    if msg is None or len(msg) == 0:
        return None

    try:
        dt = parse(msg, fuzzy=True)
        return dt.strftime('%Y-%m-%d %H:%M:%S')
    except Exception as e:
        m = re.match(
            r"([0-9零一二两三四五六七八九十]+年)?([0-9一二两三四五六七八九十]+月)?([0-9一二两三四五六七八九十]+[号日])?([上中下午晚早]+)?([0-9零一二两三四五六七八九十百]+[点:\.时])?([0-9零一二三四五六七八九十百]+分?)?([0-9零一二三四五六七八九十百]+秒)?",
            msg)
        if m.group(0) is not None:
            res = {
                "year": m.group(1),
                "month": m.group(2),
                "day": m.group(3),
                "hour": m.group(5) if m.group(5) is not None else '00',
                "minute": m.group(6) if m.group(6) is not None else '00',
                "second": m.group(7) if m.group(7) is not None else '00',
            }
            params = {}

            for name in res:
                if res[name] is not None and len(res[name]) != 0:
                    tmp = None
                    if name == 'year':
                        tmp = year2dig(res[name][:-1])
                    else:
                        tmp = cn2dig(res[name][:-1])
                    if tmp is not None:
                        params[name] = int(tmp)
            target_date = datetime.today().replace(**params)
            is_pm = m.group(4)
            if is_pm is not None:
                if is_pm == u'下午' or is_pm == u'晚上' or is_pm =='中午':
                    hour = target_date.time().hour
                    if hour < 12:
                        target_date = target_date.replace(hour=hour + 12)
            return target_date.strftime('%Y-%m-%d %H:%M:%S')
        else:
            return None



def check_time_valid(word):
    m = re.match("\d+$", word)
    if m:
        if len(word) <= 6:
            return None
    word1 = re.sub('[号|日]\d+$', '日', word)
    if word1 != word:
        return check_time_valid(word1)
    else:
        return word1

#时间提取
def time_extract(text):

    time_res = []
    word = ''
    keyDate = {'今天': 0, '明天':1, '后天': 2}
    for k, v in psg.cut(text):
        if k in keyDate:
            if word != '':
                time_res.append(word)
            word = (datetime.today() + timedelta(days=keyDate.get(k, 0))).strftime('%Y{y}%m{m}%d{d}%H{h}%M{f}%S{s}').format(y='年', m='月',d='日', h='时', f='分', s='秒')
        elif word != '':
            if v in ['m', 't']:
                word = word + k
            else:
                time_res.append(word)
                word = ''
        elif v in ['m', 't']:
            word = k
    if word != '':
        time_res.append(word)
    result = list(filter(lambda x: x is not None, [check_time_valid(w) for w in time_res]))
    final_res = [parse_datetime(w) for w in result]

    return [x for x in final_res if x is not None]

text1 = '我要住到明天下午三点'
print(text1, time_extract(text1), sep=':')

text2 = '预定28号的房间'
print(text2, time_extract(text2), sep=':')

text3 = '我要从26号下午4点住到11月2号'
print(text3, time_extract(text3), sep=':')

text4 = '我要预订今天到30的房间'
print(text4, time_extract(text4), sep=':')

text5 = '今天30号呵呵'
print(text5, time_extract(text5), sep=':')
以下是使用bert-base-chinese训练实体识别模型的代码示例: 1. 准备数据集 首先,需要准备实体识别任务的数据集。数据集应该包含标记好的实体标签,例如“B-PER”和“I-PER”表示人名实体的开始和内部标记。 2. 定义模型 定义一个使用bert-base-chinese预训练模型的实体识别模型,可以使用PyTorch和Transformers库: ```python import torch from transformers import BertForTokenClassification, BertTokenizer model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=5) tokenizer = BertTokenizer.from_pretrained('bert-base-chinese') ``` 在这里,我们使用“num_labels”参数指定模型输出的标签数,即实体类别数。 3. 定义训练循环 接下来,我们定义训练循环来训练我们的模型: ```python from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from transformers import AdamW, get_linear_schedule_with_warmup device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) train_data = ... # 加载训练数据集 train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32) optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8) total_steps = len(train_dataloader) * 3 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps) for epoch in range(3): for batch in train_dataloader: model.train() batch = tuple(t.to(device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]} outputs = model(**inputs) loss = outputs[0] loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() scheduler.step() optimizer.zero_grad() ``` 在这里,我们使用随机采样器将训练数据集的数据随机分成小批次。我们使用AdamW优化器和带有线性学习率调度程序的预热来训练模型。在每个时期内,我们遍历每个小批次并计算损失和梯度,然后更新模型参数。 4. 评估模型 训练完成后,我们可以使用测试数据集来评估模型的性能: ```python test_data = ... # 加载测试数据集 test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32) model.eval() predictions = [] true_labels = [] for batch in test_dataloader: batch = tuple(t.to(device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]} with torch.no_grad(): outputs = model(**inputs) logits = outputs[1].detach().cpu().numpy() label_ids = inputs['labels'].cpu().numpy() predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) true_labels.extend(label_ids) from sklearn.metrics import f1_score f1_score = f1_score(true_labels, predictions, average='weighted') print("F1 score:", f1_score) ``` 在这里,我们将测试数据集分成小批次,并使用模型的“eval”方法来计算预测标签。然后,我们使用f1_score度量来评估模型性能。 这就是使用bert-base-chinese训练实体识别模型的代码示例。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值