2020.6.7 更新:添加了异常处理机制,解决由于网络不好返回空值时程序崩溃的问题
2020.6.7 更新:如果程序之前可以正常运行,某次开始突然卡在读取文件编码那一步,将csv中的内容复制出来到一个新的csv文件。这个是文件的问题不是代码的问题。
————————————————————————————————————————————————————————
我好垃圾我除了调API/库之外什么也不会。
一. 获取自己的AK和SK
教程:https://ai.baidu.com/ai-doc/NLP/4k6z5cykb
二. 原始数据
数据来源:微博
将text列的值作为发送请求时的值
三.代码
1.调API时返回结果的格式为:
"items": [{"positive_prob": 0.0148066, "confidence": 0.967096, "negative_prob": 0.985193, "sentiment": 0}]
2.更改自己的AK、SK、原始数据文件地址、生成文件地址后即可使用
3.代码的逻辑为:(读取数据) —— (循环:调用API - 得到结果 - 写入csv)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 5 15:15:41 2020
@author: SerenaHuang
"""
# coding=utf-8
import sys
import json
import base64
import time
import pandas as pd
from collections import OrderedDict
import csv
import chardet
import os
os.chdir('/Users/Desktop')  # working directory where the output CSV will be created
file = "/Users/Downloads/comment/评论.csv"  # path to the source data CSV (must contain a 'text' column)
# make it work in both python2 both python3
IS_PY3 = sys.version_info.major == 3
if IS_PY3:
    from urllib.request import urlopen
    from urllib.request import Request
    from urllib.error import URLError
    from urllib.parse import urlencode
    from urllib.parse import quote_plus
else:
    import urllib2
    from urllib import quote_plus
    from urllib2 import urlopen
    from urllib2 import Request
    from urllib2 import URLError
    from urllib import urlencode
# skip https auth
# NOTE(review): this disables TLS certificate verification for ALL https
# requests in the process — insecure; acceptable only for this throwaway script.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
API_KEY = '你的AK'  # Baidu AI platform API key (AK) — replace with your own
SECRET_KEY = '你的SK'  # Baidu AI platform secret key (SK) — replace with your own
# Baidu NLP sentiment-classification endpoint
COMMENT_TAG_URL = "https://aip.baidubce.com/rpc/2.0/nlp/v1/sentiment_classify"
""" TOKEN start """
# OAuth 2.0 token endpoint used by fetch_token()
TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'
"""
get token
"""
def fetch_token():
    """Fetch an OAuth access token from the Baidu AI platform.

    Returns:
        str: the access token to append to API request URLs.

    Exits the process if the HTTP request fails, the credentials are
    wrong, or the account lacks the required NLP scope.
    """
    params = {'grant_type': 'client_credentials',
              'client_id': API_KEY,
              'client_secret': SECRET_KEY}
    post_data = urlencode(params)
    if IS_PY3:
        post_data = post_data.encode('utf-8')
    req = Request(TOKEN_URL, post_data)
    try:
        f = urlopen(req, timeout=5)
        result_str = f.read()
    except URLError as err:
        # BUG FIX: the original only printed the error and fell through,
        # crashing with a NameError because result_str was never bound.
        # Fail fast with a clear message instead.
        print(err)
        print('failed to reach the token endpoint')
        exit()
    if IS_PY3:
        result_str = result_str.decode()
    result = json.loads(result_str)
    if 'access_token' in result and 'scope' in result:
        # the NLP ability must be enabled on the Baidu console
        if 'brain_all_scope' not in result['scope'].split(' '):
            print('please ensure has check the ability')
            exit()
        return result['access_token']
    else:
        print('please overwrite the correct API_KEY and SECRET_KEY')
        exit()
"""
call remote http server
"""
def make_request(url, comment):
    """Classify one comment's sentiment via the Baidu NLP API.

    Args:
        url: full endpoint URL including charset and access_token.
        comment: the text to classify.

    Returns:
        OrderedDict with keys 'sentiment', 'negative_prob',
        'positive_prob', 'confidence'; empty when the API returned an
        unrecoverable error.
    """
    print("---------------------------------------------------")
    print("评论文本:")
    print(" " + comment)
    response = request(url, json.dumps(
        {
            "text": comment,
        }))
    data = json.loads(response)
    result = OrderedDict()
    if "error_code" not in data or data["error_code"] == 0:
        for item in data["items"]:
            result['sentiment'] = item['sentiment']
            result['negative_prob'] = item['negative_prob']
            result['positive_prob'] = item['positive_prob']
            result['confidence'] = item['confidence']
    else:
        # print error response
        print(response)
        # error_code 18 = QPS/concurrency limit exceeded: back off and retry.
        # BUG FIX: the original retried inside a while-loop but discarded the
        # retry's return value and returned the empty dict built above;
        # propagate the retry result instead.
        if data["error_code"] == 18:
            time.sleep(0.5)
            return make_request(url, comment)
    # throttle to stay under the per-second request quota
    time.sleep(0.5)
    return result
"""
call remote http server
"""
def request(url, data):
    """POST a JSON payload string to `url` and return the response body.

    Args:
        url: target URL.
        data: JSON string to send as the (utf-8 encoded) request body.

    Returns:
        str: decoded response body on success; None if the HTTP request
        failed (the error is printed), matching the original
        best-effort behaviour relied on by make_request's caller.
    """
    req = Request(url, data.encode('utf-8'))
    # removed unused `has_error` flag from the original
    try:
        f = urlopen(req)
        result_str = f.read()
        if IS_PY3:
            result_str = result_str.decode()
        return result_str
    except URLError as err:
        print(err)
        return None
if __name__ == '__main__':
    # Detect the source CSV's encoding first (files from Weibo exports vary),
    # then load the comments from its 'text' column.
    with open(file, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']
    df = pd.read_csv(file, encoding=encoding)
    comment_list_series = df['text']
    comment_list_size = comment_list_series.size
    # get access token
    token = fetch_token()
    # concat url
    url = COMMENT_TAG_URL + "?charset=UTF-8&access_token=" + token
    result_headers = [
        'sentiment',
        'negative_prob',
        'positive_prob',
        'confidence'
    ]
    # 'a' mode: appends if the output file already exists (resume-friendly)
    with open('sentiment_analysis.csv', 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(result_headers)
        for i in range(comment_list_size):
            try:
                result_data = make_request(url, comment_list_series[i]).values()
                print(i)
                # BUG FIX: the original tested `result_data != None`, which is
                # always true for dict.values(); an empty result then wrote a
                # zero-column row, breaking alignment with the input. Write a
                # blank 4-field row instead so row i always maps to comment i.
                if result_data:
                    writer.writerow(result_data)
                else:
                    writer.writerow(["", "", "", ""])
            except Exception as err:
                # network failure / empty response: log it and keep the row
                # count aligned with the input by writing a blank row
                print(err)
                writer.writerow(["", "", "", ""])
                print(i)