Keywords: tushare data retrieval, support vector machine, natural language processing, finance, financial markets, sentiment analysis, word frequency statistics
1. Import the required libraries
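The steps below rely on three libraries: tushare for pulling the news flashes, jieba for Chinese word segmentation, and scikit-learn's CountVectorizer for the bag-of-words counts. A minimal sketch of the imports (they are also repeated in the individual steps below); install anything missing with pip install tushare jieba scikit-learn.
# Data source: tushare pro API (needs a personal token)
import tushare as ts
# Chinese word segmentation
import jieba
# Bag-of-words feature extraction
from sklearn.feature_extraction.text import CountVectorizer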
2. Use tushare to fetch news flash text
# Import tushare
import tushare as ts
# Initialize the pro API with your personal token
pro = ts.pro_api('your token here')
# Pull Jinse news flashes for the given date range
news = pro.jinse(**{
    "start_date": "2022-06-01",
    "end_date": "2022-06-02",
    "limit": "",
    "offset": ""
}, fields=[
    "title",
    "content",
    "type",
    "url",
    "datetime"
])
news.head()
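As an alternative to passing the token directly to pro_api on every call, tushare also supports registering it once with ts.set_token; a small sketch, with the token string as a placeholder:
import tushare as ts
# Register the token once; tushare caches it locally
ts.set_token('your token here')
pro = ts.pro_api()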
3. Text cleaning and word segmentation
# Take one news item's content as an example (column 1 is 'content')
message = news.iloc[7][1]
# Optionally strip whitespace and other useless characters first
# message = ''.join(message.split())
# Segment the text and remove stopwords
# (encoding='UTF-8' is recommended when reading Chinese files)
import jieba
stopwords = [line.strip() for line in open(r"stopwords.txt", encoding='UTF-8').readlines()]
# jieba.cut yields tokens; keep those not in the stopword list and join with spaces
words = ' '.join(w for w in jieba.cut(message) if w not in stopwords)
words
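Since the keywords above mention word frequency statistics, a quick sketch of counting token frequencies on the cleaned text with collections.Counter (an addition for illustration, not part of the original pipeline):
from collections import Counter
# Count how often each token occurs in the cleaned, space-separated text
freq = Counter(words.split())
print(freq.most_common(10))  # the ten most frequent tokens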
4. CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Write the space-separated tokens to a file; each line of the file
# is treated as one document by the vectorizer
with open('message.txt', 'w', encoding='UTF-8') as f:
    f.write(words)
vect = CountVectorizer()
with open('message.txt', encoding='UTF-8') as f:
    vect.fit(f)
with open('message.txt', encoding='UTF-8') as f:
    vectors = vect.transform(f)
print(vectors.toarray())
vect.vocabulary_
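To read the counts back as a term-to-frequency mapping, the learned vocabulary can be paired with the single row of the count matrix; a small sketch (get_feature_names_out requires scikit-learn >= 1.0, older versions use get_feature_names):
# Pair each vocabulary term with its count in the one-document matrix
counts = dict(zip(vect.get_feature_names_out(), vectors.toarray()[0]))
print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10])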
Study notes