import jieba.analyse as ana
import pandas as pd
# Tag each headline in hotnews.xlsx with the Yunnan place name found among its
# jieba TF-IDF keywords, then save the table to hotnews_fenlei.csv.
data = pd.read_excel('hotnews.xlsx')

# Place names to look for. Order matters: when a headline yields several of
# these as keywords, the one listed LAST here is the one that gets recorded.
diqu = [
    '保山', '施甸', '腾冲', '龙陵', '昌宁', '昭通', '大关', '鲁甸', '巧家', '水富',
    '绥江', '威信', '盐津', '彝良', '永善', '镇雄', '丽江', '永胜', '华坪', '玉龙',
    '宁蒗', '普洱', '翠云', '江城', '景东', '景谷', '澜沧', '孟连', '墨江', '宁洱',
    '思茅', '西盟', '镇沅', '临沧', '凤庆', '云县', '永德', '镇康', '双江', '耿马',
    '沧源', '楚雄', '双柏', '牟定', '南华', '姚安', '大姚', '永仁', '元谋', '武定',
    '禄丰', '大理', '祥云', '宾川', '弥渡', '永平', '云龙', '洱源', '剑川', '鹤庆',
    '漾濞', '南涧', '巍山', '红河', '个旧', '开远', '绿春', '建水', '石屏', '弥勒',
    '泸西', '元阳', '红河', '金平', '河口', '屏边', '文山', '砚山', '西畴', '麻栗坡',
    '马关', '丘北', '广南', '富宁', '西双版纳', '版纳', '勐海', '勐腊', '德宏', '瑞丽',
    '梁河', '盈江', '陇川', '怒江', '福贡', '贡山', '兰坪', '迪庆', '德钦', '维西',
    '玉溪', '麒麟区', '宣威', '马龙', '沾益', '富源', '罗平', '师宗', '陆良', '会泽',
    '曲靖', '红塔区', '江川', '澄江', '通海', '华宁', '易门', '峨山', '新平', '元江',
    '昆明市', '保山市', '昭通市', '丽江市', '普洱市', '临沧市', '楚雄州', '红河州',
    '文山州', '西双版纳州', '版纳州', '版纳', '德宏州', '怒江州', '大理州', '迪庆州',
    '玉溪市', '曲靖市', '昆明', '盘龙区', '五华区', '官渡区', '西山区', '东川区',
    '安宁', '呈贡', '晋宁', '富民', '宜良', '嵩明', '石林', '禄劝', '寻甸', '云南',
    '云南省',
]

# Create the output column when the spreadsheet lacks it, and force object
# dtype so string labels can be stored even if the column was read as all-NaN.
if '地区' not in data.columns:
    data['地区'] = None
data['地区'] = data['地区'].astype(object)

for idx, title in data['标题'].items():
    # Blank cells are read as NaN (a float); there is no text to extract from.
    if not isinstance(title, str):
        continue
    # ana.textrank(title) is the alternative extractor that was being compared.
    keywords = set(ana.tfidf(title))
    # Scan from the end so the first hit is the last-listed matching place,
    # which is the value the original nested loop ended up keeping.
    for place in reversed(diqu):
        if place in keywords:
            # .at writes into the frame itself; the chained form
            # data['地区'][idx] = ... can end up modifying a temporary copy.
            data.at[idx, '地区'] = place
            break

out_path = 'hotnews_fenlei.csv'
# utf_8_sig prepends a BOM to the UTF-8 output.
data.to_csv(out_path, encoding="utf_8_sig")
这里做了一个简单的关键词分析,用来从新闻标题中提取地理信息(地区)数据。目前还没有试验出来,到底是 TextRank 好用,还是 TF-IDF 好用。