python脚本将mysql中的数据存入Elasticsearch
公司数据分析的项目,需要爬虫获取数据存入msyql,然后将mysql的数据转存到elasticsearch,没有太高效率的需求,只需判断各字段非空即可。简单的python实现代码和ES创建的代码如下:
python从mysql转存到ES
import re
from elasticsearch import Elasticsearch
import time
from bs4 import BeautifulSoup
es = Elasticsearch([
{'host': '192.168.1.126'},
{'host': 'othernode', 'port': 9200, 'url_prefix': 'es', 'use_ssl': True},
])
#与mysql建立连接
import MySQLdb
conn= MySQLdb.connect(host='192.168.1.105',port = 3306,user='oaplatform',passwd='oaplatform',db ='ph_system',charset="utf8")
cursor = conn.cursor()
#connect() 方法用于创建数据库的连接,里面可以指定参数:用户名,密码,主机等信息。
#这只是连接到了数据库,要想操作数据库需要创建游标。
#预处理
def pretreat(str_txt):
str_txt=str(str_txt) # 转字符串
#"去标签"
soup = BeautifulSoup(str_txt, "lxml")
str_txt=soup.get_text()
str_txt = re.sub('\n|\t','',str_txt)
str_txt = re.sub('"','',str_txt)
str_txt = re.sub("'",'',str_txt)
str_txt = re.sub(' ','',str_txt)
str_txt = re.sub("\xa0",'',str_txt)
str_txt = re.sub("\u3000",'',str_txt)
return str_txt
#查询
#判断非空
sql = cursor.execute("select * from xlt_train where sign='102'")
for info in cursor:
print("ok")
time.sleep(0.01) # 睡0.01秒
#print (info[8])
#print(info[0])
if((info[0]!=None) & (info[1]!=None) & (info[2]!=None) & (info[3]!=None)& (info[4]!=None)& (info[5]!=None)& (info[6]!=None)):
#print(info[0],info[8]==None)
id_i=str(info[0])
source_url=str(info[1])
title=pretreat(info[2])
publish_date=str(info[3])
institution=str(info[4])
text_summary=pretreat(info[5])
sign=str(info[6])
data='{"title":"%s","publish_date":"%s","institution":"%s","source_url":"%s","text_summary":"%s"}' %(title, publish_date, institution ,source_url, text_summary)
try:
result = es.create(index='smetrend', doc_type='info', id=id_i, body=data)
print(result)
except Exception:
print (Exception)
continue
cursor.close()
#提交
conn.commit()
#关闭
conn.close()
创建ES的代码
PUT smetrend
{
"mappings": {
"info":{
"properties": {
"title":{
"store":true,
"type": "text",
"analyzer":"ik_max_word",
"fielddata": true
},
"publish_data":{
"store":true,
"type": "keyword"
},
"source_url":{
"store":true,
"type": "keyword"
},
"text_summary":{
"store":true,
"type": "text",
"analyzer":"ik_max_word",
"fielddata": true
},
"publish_institution":{
"store":true,
"type": "text",
"analyzer":"ik_max_word",
"fielddata": true
},
"sign":{
"store":true,
"type": "keyword"
}
}
}
}
}