# ---------- Load data ----------
import pandas as pd
# Training sets: one workbook per language.
# NOTE(review): "trian" in the Chinese file name looks like a misspelling of
# "train". It is left untouched here; confirm against the file on disk
# before renaming anything.
train_cn = pd.read_excel('./汽车领域多语种迁移学习挑战赛初赛训练集/中文_trian.xlsx')
train_ja = pd.read_excel('./汽车领域多语种迁移学习挑战赛初赛训练集/日语_train.xlsx')
train_en = pd.read_excel('./汽车领域多语种迁移学习挑战赛初赛训练集/英文_train.xlsx')
# Test sets: both languages are sheets of the same workbook.
test_ja = pd.read_excel('testA.xlsx', sheet_name='日语_testA')
test_en = pd.read_excel('testA.xlsx', sheet_name='英文_testA')
dataset = [train_cn, train_ja, train_en, test_ja, test_en]

# Print the shape of every loaded frame.
for data in dataset:
    print(data.shape)
# info() is called once, after the loop: it does not use the loop variable.
train_cn.info()
# ---------- Text analysis and tokenization ----------
#中文分词
import jieba
def cutword(txt):
    """Tokenize *txt* with jieba and return the tokens as a list.

    Args:
        txt: The text to segment; it is passed to ``jieba.lcut`` unchanged.

    Returns:
        Whatever ``jieba.lcut(txt)`` returns (a list of tokens).
    """
    return jieba.lcut(txt)
# Tokenize each entry of the raw-text column and store the result in a new
# column of the Chinese training frame.
train_cn['分词'] = train_cn['原始文本'].map(cutword)
# Japanese tokenization (nagisa)
import nagisa
def cutword_jp(txt):
    """Tokenize *txt* with nagisa and return the tokens.

    Args:
        txt: The text to segment; it is passed to ``nagisa.tagging``
            unchanged.

    Returns:
        The ``words`` attribute of the object returned by
        ``nagisa.tagging(txt)``.
    """
    tagged = nagisa.tagging(txt)
    return tagged.words
# Tokenize each entry of the raw-text column and store the result in a new
# column of the Japanese training frame.
train_ja['分词'] = train_ja['原始文本'].map(cutword_jp)
# ---------- TF-IDF and text classification ----------