import json
import jieba,jieba.analyse
import re
# 1. Read the exported file (UTF-8 text that contains a JSON payload).
with open("D:\\文件名.do", encoding="utf-8") as f:
    text = f.read()
print(text[0:10])  # peek at the first characters of the raw text
print(text[-1])    # and the last character (sanity check the file is complete)
print(type(text))

# 2. Parse with the json module.
# NOTE(review): rebinding the name `json` shadows the json module from here on;
# kept as-is because the rest of the script reads this variable, but a distinct
# name (e.g. `data`) would be safer.
json = json.loads(text)
print(type(json))              # dict
print(len(json), json.keys())  # top-level keys
print(json['bond'])            # sample one top-level key
print(type(json['queryPage']['list']) == type(list()))  # True: it is a list
# 3. Explore the JSON structure.
def select_json(json):
    """Print and return a map from each child key (or list index) to the
    length of its stringified content, to help locate the "main" key at
    each nesting level.

    Accepts either a list (keys become "<index><type>") or a dict (keys are
    the dict's own keys). Values are len(str(child)); 0 if stringification
    fails for some exotic child object.
    """
    content_count_0 = {}
    if isinstance(json, list):
        for i, key in enumerate(json):
            try:
                content_count_0[str(i) + str(type(key))] = len(str(key))
            except Exception:
                # original wrote str(i)+type(key) here, which would itself
                # raise TypeError; record 0 under the same composed key
                content_count_0[str(i) + str(type(key))] = 0
    else:
        for key in json.keys():
            try:
                content_count_0[key] = len(str(json[key]))
            except Exception:
                content_count_0[key] = 0
    print(content_count_0)  # show per-level sizes so the primary key stands out
    return content_count_0
# Walk down the levels with select_json to locate the record list.
content_count_0 = select_json(json)
content_count_1 = select_json(json['queryPage'])
content_count_2 = select_json(json['queryPage']['list'])
contents = json['queryPage']['list']  # the list of records
# Check whether the record format is fixed across the list:
print([len(content) for content in contents])
print(contents[0]['lksFields'][4]['value'])  # field 4: title
print(contents[0]['lksFields'][0]['value'])  # field 0: body / abstract
print([len(content['lksFields']) for content in contents])
# 4. Extract and merge the main content, de-duplicating by title.
# (Incomplete: per the original note, field lengths differ between records,
# so the positional assumption [4]=title, [0]=abstract may not hold for all —
# TODO confirm against the exporter's schema.)
title_abstract = {"标题": ["关键词周围摘要", "出现次数"]}  # header row: title -> [abstract, count]
for content in contents:
    title = content['lksFields'][4]['value']
    if title in title_abstract:
        title_abstract[title][1] += 1  # duplicate title: bump its count
    else:
        title_abstract[title] = [content['lksFields'][0]['value'], 1]
# 5. Collect every field value that contains NO Chinese characters,
#    dump them to a text file, then run TF-IDF keyword extraction.
title_abstracts = []
contain_zh = re.compile(u'[\u4e00-\u9fa5]+')  # matches any CJK unified ideograph
for content in contents:
    for elm in content['lksFields']:
        s = elm['value']
        if contain_zh.search(s):
            # value contains Chinese characters -> skip it
            # (an earlier variant filtered on substrings like "kms"/"gateway"/
            # "news"/"knowledge" instead; kept here as history)
            pass
        else:
            title_abstracts.append(s)

# NOTE(review): the original chained three .replace("", "") calls here — all
# no-ops because the target strings were lost in the paste; per the TODOs
# below they were meant to strip newlines, "o:p" tags and spaces. Restore the
# real targets when known.
contents_str = str(title_abstracts)
print(len(title_abstracts), len(contents_str))

# Persist the collected values for inspection.
with open("D:\\文件名.txt", "w", encoding="utf-8") as text_file:
    text_file.write(contents_str)

# TF-IDF keywords: top 500, with weights, all parts of speech.
tf_idf_keywords = jieba.analyse.extract_tags(contents_str, topK=500,
                                             withWeight=True, allowPOS=(),
                                             withFlag=True)
# TODO: strip newline, "o:p" and space junk characters before extraction
# TODO: split into sentences and de-duplicate for better key-information extraction