#!/usr/bin/env python
# -*- coding: utf-8 -*-
import ConfigParser
import json
import os
import random
import re
import sys
import time
from re import sub
from HTMLParser import HTMLParser

import requests
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from qiniu import Auth
from qiniu import etag
from qiniu import put_file

import log_config
from OP_Mysql import get_connection
reload(sys)
sys.setdefaultencoding('utf-8')
logger= log_config.getlogger('analysis_neeq_content', 'analysis_neeq_content.log')
conf=ConfigParser.ConfigParser()
conf.read("mysql.conf")
neeq_remainder= conf.get("basic_config", "neeq_remainder")
neeq_server_num= conf.get("basic_config", "neeq_server_num")
neeq_start_id= conf.get("basic_config", "neeq_start_id")
neeq_json_path= conf.get("basic_config", "neeq_json_path")
neeq_json= conf.get("basic_config", "neeq_json")
json_suffix= '.json'neeq_id= conf.get("basic_config", "neeq_id")
neeq_file_path= conf.get("basic_config", "neeq_file_path")
access_key= conf.get("basic_config", "access_key")
secret_key= conf.get("basic_config", "secret_key")
bucket= conf.get("basic_config", "bucket")classanalysis:def __init__(self):#用于文件追加
self.count =0
self.neeq_json=neeq_json
self.headers= {'Host': 'www.neeq.com.cn','User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'}
self.create_init_dirtory()#创建初始文件夹
defcreate_init_dirtory(self):if notos.path.exists(neeq_json_path):
os.makedirs(neeq_json_path)if notos.path.exists(neeq_file_path):
os.makedirs(neeq_file_path)#mysql 获取数据
defget_data(self):
with get_connection() as db:#SQL 查询语句
count = r"SELECT COUNT(*) as num FROM ssb_insight_neeq WHERE pro_status = 0 AND neeq_id <= %s and %s = (neeq_id %% %s)"logger.info("now excute sql script sql = %s" %count)try:#获取所有记录列表
db.cursor.execute(count, [neeq_start_id, neeq_remainder, neeq_server_num])
counts=db.cursor.fetchall()
num= counts[0]['num']
logger.info("now rows num = %s" %num)if 0 != num % 1000:
pages= num / 1000 + 1
else:
pages= num / 1000start_rows= 1000
for i inrange(0, pages):
start_page= i * 1000sql= "SELECT t.sec_code,t.sec_name,t.title,t.doc_type,t.doc_type_key,c.industry1,c.industry2,"\"t.url,t.public_time,t.content,t.pro_status,t.module,t.es_id FROM ssb_insight_neeq t"\"LEFT JOIN ssb_d_listed_company c ON t.sec_code = c.secCode WHERE t.pro_status = 0 and t.neeq_id <= %s"\"AND %s = (t.neeq_id %% %s) ORDER BY t.neeq_id DESC LIMIT %s ,%s"db.cursor.execute(sql, [neeq_start_id, neeq_remainder, neeq_server_num, start_page, start_rows])
result_datas=db.cursor.fetchall()#1000 数据放入此数组
json_data =[]
es_id_file_addr=[]for row inresult_datas:
item={}
es_obj={}
result= {'secCode': row['sec_code'],'secName': row['sec_name'],'title': row['title'],'docType': row['doc_type'].split(','),'docTypeKey': row['doc_type_key'].split(','),'url': row['url'],'publicTime': row['public_time'],'industry1': row['industry1'],'industry2': row['industry2'],'content': row['content'],'proStatus': bool(row['pro_status']),'module': row['module'],
}
file_url= row['url']
self.download_file(file_url)
file_name= re.findall(r".*/(.*)", file_url)[0]
file_paths= neeq_file_path +file_nameifos.path.exists(file_paths):
content=self.analysis_file_content(file_paths)
self.upload_qiniu(file_paths)
self.del_file(file_paths)if content == '':continueresult['content'] =contentelse:
logger.warn("file_url %s download fail" %file_url)continueitem['id'] = row['es_id']
item['data'] =result
json_data.append(item)
es_obj['es_id'] = row['es_id']
es_obj['file_addr'] =file_paths
es_id_file_addr.append(es_obj)
self.write_json_file(json_data)
self.write_es_id_file_addr(es_id_file_addr)exceptException as e:
logger.error("Error: unable to fecth data Exception %s" %e)defwrite_json_file(self, json_data):#写数据
json_path = neeq_json_path + self.neeq_json +json_suffix
rows=self.get_json_rows(json_path)if rows > 100000:
self.count= self.count + 1self.neeq_json= neeq_json +str(self.count)
json_path= neeq_json_path + self.neeq_json +json_suffix
with open(json_path,'a') as es_file:for jsonitem injson_data:
jsondatar= json.dumps(jsonitem, ensure_ascii=True)
es_file.write(jsondatar+"\n")defwrite_es_id_file_addr(self, es_id_data):#写入es_id,以及 七牛云 地址
with open(neeq_id, 'a') as es_id_file:for jsonitem ines_id_data:
es_id_file.write(jsonitem['es_id']+","+jsonitem['file_addr']+";"+"\n")#获取json文件行数,用于分文件存储
defget_json_rows(self, json_path):
count=0if notos.path.exists(json_path):return0
thefile= open(json_path, 'rb')whileTrue:
buffer= thefile.read(8192 * 1024)if notbuffer:breakcount+= buffer.count('\n')
thefile.close()returncount#上传文件
defupload_qiniu(self, file_path_name):
q=Auth(access_key, secret_key)#生成上传 Token,可以指定过期时间等
token = q.upload_token(bucket, file_path_name, 3600)#要上传文件的本地路径
ret, info =put_file(token, file_path_name, file_path_name)#logger.info(info)
if info.status_code != 200:
logger.info("file upload qiniuyun fail %s" %file_path_name)#删除文件
defdel_file(self, file_path_name):ifos.path.exists(file_path_name):
os.remove(file_path_name)else:
logger.info("%s 文件不存在" %file_path_name)#下载文件
defdownload_file(self, file_url):
time.sleep(random.uniform(1, 2))
retry=0try:while retry < 3:
file_name= re.findall(r".*/(.*)", file_url)[0]
response= requests.get(file_url, stream=True, headers=self.headers, timeout=5)if response.status_code ==requests.codes.ok:
with open(neeq_file_path+ file_name, "wb") as code:for chunk in response.iter_content(chunk_size=1024):ifchunk:
code.write(chunk)break
exceptException as e:
logger.exception(e)
retry+= 1
#解析文件
defanalysis_file_content(self, filename):
content= ''fenzhihouzhui= re.findall(r'.*(\..*)', str(filename))[0]if fenzhihouzhui == '.pdf' or fenzhihouzhui == '.PDF':
content=self.analysis_pdf_file_content(filename)elif fenzhihouzhui == '.html' or fenzhihouzhui == '.HTML':
content=self.analysi_html_file_content(filename)returncontentdefanalysis_pdf_file_content(self, filename):
content= ''
try:
fileobject= open(filename, 'rb')
parser=PDFParser(fileobject)
document=PDFDocument(parser)if notdocument.is_extractable:raisePDFTextExtractionNotAllowedelse:
rsrcmgr=PDFResourceManager()
laparams=LAParams()
device= PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter=PDFPageInterpreter(rsrcmgr, device)for page inPDFPage.create_pages(document):
interpreter.process_page(page)
layout=device.get_result()for x inlayout:ifisinstance(x, LTTextBoxHorizontal):
results= x.get_text().encode('utf-8')
content+=results
fileobject.close()exceptException as e:
logger.error("analysis pdf file fail : %s" %e)returncontentdefanalysi_html_file_content(self, filename):
content_open= open(filename, 'rb')
contents=content_open.read()printcontents
contents=dehtml(contents)classpythonNToTxt(HTMLParser):def __init__(self):
HTMLParser.__init__(self)
self.__text =[]defhandle_data(self, data):
text=data.strip()if len(text) >0:
text= sub('[ \t\r\n]+', ' ', text)
self.__text.append(text + ' ')defhandle_starttag(self, tag, attrs):if tag == 'p':
self.__text.append('\n\n')elif tag == 'br':
self.__text.append('\n')defhandle_startendtag(self, tag, attrs):if tag == 'br':
self.__text.append('\n\n')deftext(self):return ''.join(self.__text).strip()defdehtml(text):try:
parser=pythonNToTxt()
parser.feed(text)
parser.close()returnparser.text()exceptException as e:
logger.error("html analysis excepiton : %s" %e)returntext
logger.info("analysis neeq content start,now params neeq_remainder=%s,neeq_start_id =%s,neeq_json = %s,neeq_id = %s ,neeq_file_path = %s" %(neeq_remainder, neeq_start_id, neeq_json, neeq_id, neeq_file_path))
analysis=analysis()
analysis.get_data()