分割name
把name分割成中文名和外文名
连接数据库
从setting.json中读取MongoDB的host、用户名和密码
获取movie库
# Connect to the database.
# The MongoDB host, username and password are read from setting.json.
with open('setting.json', encoding='utf-8') as f:
    setting = json.load(f)

# Handle to the "movie" database on the configured host (port 27017).
db = MongoClient(
    "mongodb://{}:27017/movie".format(setting['host']),
    username=setting['username'],
    password=setting['password'],
)["movie"]

# OpenCC converter loaded with the 't2s' configuration
# (Traditional Chinese -> Simplified Chinese).
cc = opencc.OpenCC('t2s')
百度翻译api
# Baidu Translate API credentials; both values are concatenated into the
# MD5 request signature for the language-detection call further down.
# NOTE(review): the secret key is hard-coded in source. Consider loading it
# from setting.json alongside the MongoDB credentials -- confirm with owner.
appid = '20200623000504514'
secretKey = 'LC_mF3p9AiAfUhHbzAGs'
def common_char(string1, string2):
    """Return True if at least one character of string1 occurs in string2.

    Args:
        string1: String whose characters are tested one by one.
        string2: String (or any container) searched for each character.

    Returns:
        True as soon as a shared character is found; False otherwise,
        including when either argument is empty.
    """
    return any(c in string2 for c in string1)
# Regex used for the first splitting attempt: everything up to the last CJK
# ideograph that is followed by a space is the Chinese name; the remainder
# (ASCII letters, digits, spaces and a few punctuation marks) is the foreign
# name.  The hyphen is escaped so that it is matched literally; unescaped,
# ":-?" is a character range and titles such as "Spider-Man" never matched.
_name_split_re = re.compile(r'(.*[\u4e00-\u9fa5]) ([A-Za-z !.,:\-?&()\d]+)$')


def _is_chinese(token):
    """Ask Baidu Translate's language-detection endpoint whether token is Chinese.

    Returns True only when the response has error_code 0 and the detected
    language is "zh"; an API error therefore counts as "not Chinese".
    The HTTP connection is always closed, even if the request fails.
    """
    salt = random.randint(32768, 65536)
    # Request signature: MD5 of appid + query + salt + secret key.
    sign = hashlib.md5(
        (appid + token + str(salt) + secretKey).encode()).hexdigest()
    url = ('/api/trans/vip/language'
           + '?appid=' + appid
           + '&q=' + urllib.parse.quote(token)
           + '&salt=' + str(salt)
           + '&sign=' + sign)
    conn = http.client.HTTPConnection('api.fanyi.baidu.com')
    try:
        conn.request('GET', url)
        result = json.loads(conn.getresponse().read().decode("utf-8"))
    finally:
        conn.close()
    return result['error_code'] == 0 and result['data']['src'] == "zh"


# Split the combined `name` of douban records into `name` (Chinese) and
# `nameFrn` (foreign).  The query selects names containing a CJK ideograph,
# later a space, and later a character that is neither a CJK ideograph nor a
# digit.  ("\\d" yields the same pattern text as the former "\d" without
# relying on an invalid string escape.)
# The cursor is opened with no_cursor_timeout, so it is closed explicitly via
# the context manager instead of being left open on the server.
with db['details'].find(
        {'source': 'douban',
         "name": {"$regex": "[\u4e00-\u9fa5].* .*[^\u4e00-\u9fa5\\d]"}},
        no_cursor_timeout=True) as cursor:
    for item in cursor:
        # Only process records whose nameFrn is missing or blank.
        if (item.get('nameFrn') or '').strip():
            continue

        # First try to split with the regular expression.
        match = _name_split_re.match(item['name'])
        if match:
            name = match.group(1)
            nameFrn = match.group(2)
        else:
            # The regex failed: fall back to Baidu's language-detection API.
            # Tokens are Chinese-name parts until the first token that is not
            # detected as Chinese; that token and everything after it form the
            # foreign name (no further API calls are made).
            chinese_parts = []
            foreign_parts = []
            foreign_started = False
            for token in item['name'].split(' '):
                if not foreign_started and not _is_chinese(token):
                    foreign_started = True
                if foreign_started:
                    foreign_parts.append(token)
                else:
                    chinese_parts.append(token)
            name = ' '.join(chinese_parts)
            nameFrn = ' '.join(foreign_parts)

        name = name.strip()
        nameFrn = nameFrn.strip()
        # Nothing was recognised as Chinese: use the foreign name as the name.
        if name == "":
            name = nameFrn
        print(name+"|*********|"+nameFrn)
        if name != "":
            db['details'].update_one(
                {"sourceId": item["sourceId"], 'source': 'douban'},
                {"$set": {"name": name, "nameFrn": nameFrn}})
但当 name 由中文名和日文名组合而成时,上述方法无法正确区分中文和日文(日文汉字与中文字符重叠),
因此需要对这部分数据做进一步处理。
# Further processing for names in which Chinese and Japanese could not be
# told apart by the previous pass.
# U+3040-U+31FF includes the kana blocks; U+AC00-U+D7AF and U+1100-U+11FF are
# Hangul syllables and Hangul Jamo.
_kana_hangul_re = re.compile('[\u3040-\u31FF\uAC00-\uD7AF\u1100-\u11FF]')
_kana_re = re.compile('[\u3040-\u31FF]')

# Same selection query as the first pass ("\\d" yields the same pattern text
# as the former "\d").  The no_cursor_timeout cursor is closed explicitly by
# the context manager.
with db['details'].find(
        {'source': 'douban',
         "name": {"$regex": "[\u4e00-\u9fa5].* .*[^\u4e00-\u9fa5\\d]"}},
        no_cursor_timeout=True) as cursor:
    for item in cursor:
        tokens = item['name'].split(' ')
        # The first token already contains kana/Hangul: leave the record alone.
        if _kana_hangul_re.search(tokens[0]):
            continue

        split = None  # (name, nameFrn) once a split point has been found
        if len(tokens) == 2 and _kana_hangul_re.search(tokens[1]):
            # Exactly two tokens and the second one contains kana/Hangul.
            split = (tokens[0], tokens[1])
        elif _kana_re.search(item['name']):
            # Japanese: split at the first non-empty token that, after
            # conversion with the t2s converter, shares a character with the
            # first token (the last character of each is ignored).
            for i in range(1, len(tokens)):
                if tokens[i] == '':
                    continue
                potential = cc.convert(tokens[i])
                if common_char(potential[:-1], tokens[0][:-1]):
                    split = (' '.join(tokens[:i]), ' '.join(tokens[i:]))
                    break

        if split is not None:
            # update_one() needs an update document built from $ operators;
            # passing the whole record, as before, raises ValueError.
            db['details'].update_one(
                {'_id': item['_id']},
                {"$set": {"name": split[0], "nameFrn": split[1]}})