涉及到爬虫数据处理、数据清洗、数据查询、数据入库等操作时,则需要进行数据库的连接,以便进行数据查、校验、入库等操作;
此处示例在python中,如何配置和连接数据库Mysql、Oracle、MSsql、Elasticsearch,以及代码执行方式(对应package包的安装比较简单就自行安装,这里只说代码层面的)
1、连接Mysql
import pymysql
# 创建数据库连接
con = pymysql.connect(
host='127.0.0.1',
port=3306,
user='root',
password='123456',
db='test',
charset='utf8'
)
# 创建游标
cur = con.cursor()
# 执行sql语句
cur.execute('show tables')
# 获取sql语句全部result
# 获取一条数据:table=cursor.fetchone()
table = cur.fetchall()
# 关闭游标
cur.close()
# 关闭数据库连接
con.close()
2、连接Oracle
import cx_Oracle
# 创建数据库连接
con = cx_Oracle.connect('root', 'root123', '127.0.0.1:1521/ORCL')
# 创建游标
cursor = con.cursor()
# 执行sql语句
cursor.execute("select * from ORCL where ID='1'")
# 获取一条数据
data = cursor.fetchone()
# 关闭游标
cursor.close()
# 关闭数据库连接
con.close()
3、连接MSsql
import pymssql
# 创建数据库连接
con = pymssql.connect(
host='localhost',
user='sa',
password='123456',
database='pubs'
)
# 创建游标
cur = con.cursor()
# 执行sql语句
cur.execute('SELECT * FROM titles')
# 获取sql语句所有result
res = cur.fetchall()
# 关闭游标
cur.close()
# 关闭数据库连接
con.close()
4、连接Elasticsearch(这里索引操作介绍的有点多,有兴趣的可以看看)
from elasticsearch import Elasticsearch
es=Elasticsearch(["localhost:9200"])
es_index = "index_spiderdata_iimedia_theme_data"
# 新建索引
try:
create_res=es.indices.create(index=es_index)
print(create_res)
except Exception as err:
print(err)
# 删除索引
es.indices.delete(index=es_index)
mappings = {
"mappings": {
"type_doc_test": { #type_doc_test为doc_type
"properties": {
"id": {
"type": "long",
"index": "false"
},
"serial": {
"type": "keyword", # keyword不会进行分词,text会分词
"index": "false" # 不建索引
},
#tags可以存json格式,访问tags.content
"tags": {
"type": "object",
"properties": {
"content": {"type": "keyword", "index": True},
"dominant_color_name": {"type": "keyword", "index": True},
"skill": {"type": "keyword", "index": True},
}
},
"hasTag": {
"type": "long",
"index": True
},
"status": {
"type": "long",
"index": True
},
"createTime": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"updateTime": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
}
}
}
}
}
res = es.indices.create(index = 'index_test',body =mappings)
# 查询
query = {
'query': {
'bool': {
'must': [
{'match': {'_all': 'python web'}}
],
'filter': [
{'term': {'status': 2}}
]
}
}
}
ret = es.search(index='articles', doc_type='article', body=query)
# 插入
doc = {
'article_id': article.id,
'user_id': article.user_id,
'title': article.title
}
es.index(index='articles', doc_type='article', body=doc, id=article.id)
# 修改 es.update()
# 删除 es.delete()
res = es.delete(index="index_test",doc_type="doc_type_test", id ="bSlegGUBmJ2C8ZCSC1R1")
以上就是在python中,进行Mysql、Oracle、MSsql、Elasticsearch等数据库连接的操作演示