如何获取数据点击这里
下载之后的文件名为:all_results.csv
数据样式大概这样。然后下面我分析的是工作要求 也就是那边的绿框那一列。
import csv
import os
import jieba
import jieba.posseg as psg #posseg模块可以获取词性
datapath=os.path.join(os.getcwd(),"all_results.csv")
with open(datapath,'r',newline='',encoding='utf-8') as csvfile:
# rows=csv.reader(csvfile)
# headers = next(rows)
# for i ,row in enumerate(rows):
# if i%50==0:
# print("正在处理第{}行数据".format(i))
# job_required=row[8]
# job_requirednew=job_required.strip().replace(" ","")
# result_list.append(job_requirednew)
rows=csv.DictReader(csvfile)
result_list=[row['job_description'].strip().replace('\xa0','').replace('\r\n','') for row in rows]
info_attr = [(x.word,x.flag) for x in psg.cut(''.join(result_list)) if len(x.word) >= 2] # 这里的x.word为词本身,x.flag为词性
with open('out.txt','w+') as f:
for x in info_attr:
f.write('{0}\t{1}\n'.format(x[0],x[1]))
运行完上面的程序得到的文件结构如下