编码BUG:
UnicodeEncodeError: 'gbk' codec can't encode character '\uff62' in position 34: illegal multibyte sequence :
解决方案
import sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') # change the default encoding of standard output
现象:同样的代码,在自己的电脑上读取不了,在别的电脑上却可以。
原因分析:
cmd不能很好地兼容utf8,而IDLE可以:在IDLE下运行时,连“改变标准输出的默认编码”这一步都不需要,因为IDLE的输出默认就是utf8。如果一定要在cmd下运行,就需要修改标准输出的编码,例如像上面那样换成“gb18030”,就能正常显示了。
print()函数受标准输出编码的限制,不能打印该编码无法表示的unicode字符。
解决方案:
其实print()函数的局限就是Python标准输出默认编码的局限:因为系统是win7,Python标准输出的默认编码不是'utf-8',把它改成'utf-8'就行了:
import io
import sys
import urllib.request
# Re-wrap standard output as UTF-8 so that print() is no longer limited by
# the console's default codec.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read.
with urllib.request.urlopen('http://www.baidu.com') as res:
    htmlBytes = res.read()
print(htmlBytes.decode('utf-8'))
###
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/4/20 9:21
# @Author : wenjing
# @File : 商品_BIO——NER.py
import sys
import io
# Runs at import time: from here on print() encodes its output as GB18030.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') # change the default encoding of standard output
import os
# Directory part of this script's own path.
BASE_DIRL = os.path.dirname(__file__)
# Location of the raw annotation file, resolved next to this script.
# `path` is a second module-level name bound to the same string.
raw_data_path = path = os.path.join(BASE_DIRL, "ner_dataset.txt")
def text2idx(text, data):
    """Locate the first occurrence of ``data`` inside ``text``.

    Args:
        text: String to search in.
        data: Substring to look for.

    Returns:
        A ``(start, length)`` tuple. ``start`` is the index of the first
        occurrence of ``data`` in ``text`` (``-1`` when it does not occur,
        following ``str.find``) and ``length`` is ``len(data)``.
    """
    return text.find(data), len(data)
def label_storage(text, index_start, len_, y):
    """Build one BIO tag per character of ``text`` for a single entity span.

    Args:
        text: The annotated string; the result always has ``len(text)`` tags.
        index_start: Index in ``text`` where the entity starts. A negative
            value (the ``str.find`` "not found" convention) means there is
            no span to mark.
        len_: Number of characters covered by the entity.
        y: Entity type, appended to the ``B_`` / ``I_`` tag prefixes.

    Returns:
        A list of ``len(text)`` tags: ``'B_' + y`` on the first character of
        the span, ``'I_' + y`` on the remaining characters of the span and
        ``'O'`` everywhere else. If the span is missing, empty, or starts
        past the end of ``text``, every tag is ``'O'``.
    """
    labels = ['O'] * len(text)

    # Slice assignment with a negative start, a zero length or a start past
    # the end would insert tags instead of replacing them, which changes the
    # list length and misaligns the tags with the characters of ``text``.
    if index_start < 0 or len_ <= 0 or index_start >= len(text):
        return labels

    # Clamp the span so it can never extend the list beyond ``len(text)``.
    end = min(index_start + len_, len(text))
    labels[index_start:end] = ['B_' + y] + ['I_' + y] * (end - index_start - 1)
    return labels
def text_label():
    """Merge the per-entity annotations in ``raw_data_path`` into one tag list per text.

    Every non-blank line of the file must consist of exactly three fields
    separated by single spaces: the text, the entity type and the entity
    string. Each line is turned into a row of BIO tags via ``text2idx`` and
    ``label_storage``; rows that belong to the same text are then merged
    position by position.

    Merge rule: at each position the first tag that is not ``'O'`` (in file
    order) is kept; if every row has ``'O'`` there, the result is ``'O'``.

    Returns:
        A dict mapping each text to its merged list of tags.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
        OSError: If ``raw_data_path`` cannot be opened.
    """
    rows_by_text = {}
    with open(raw_data_path, 'r', encoding='UTF-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.replace('\n', "")
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no annotation, so skip them instead of failing to unpack.
            if not line.strip():
                continue
            fields = line.split(" ")
            if len(fields) != 3:
                raise ValueError(
                    'line %d of %s: expected 3 space-separated fields, got %d'
                    % (line_no, raw_data_path, len(fields))
                )
            text, y, node_text = fields
            # Position and length of the entity inside the text.
            index_start, len_ = text2idx(text, node_text)
            labels = label_storage(text, index_start, len_, y)
            rows_by_text.setdefault(text, []).append(labels)

    merged = {}
    for text, rows in rows_by_text.items():
        # zip(*rows) yields, for each position, the tags from every row.
        # Taking the first non-'O' tag keeps the result well defined even
        # when annotations overlap and disagree at a position.
        merged[text] = [
            next((tag for tag in column if tag != 'O'), 'O')
            for column in zip(*rows)
        ]
    return merged
import json
def exls_label():
    """Print the raw contents of ``./ner_dev.json`` to standard output.

    The path is relative to the current working directory. The file is read
    as UTF-8 text and printed as-is, without being parsed.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open('./ner_dev.json', 'r', encoding='utf-8') as f:
        print(f.read())
if __name__ == '__main__':
    # Sanity check: the escape sequence in the second literal denotes the
    # same character as 'a', so this prints True.
    print('a' == '\u0061')
    exls_label()
    # Alternative entry point, kept for reference: build the merged labels
    # and print each text followed by its tag list.
    # res = text_label()
    # for k in res:
    #     print(k)
    #     print(res[k])