闲来无事用Python写了点小工具
@Author:Quinn chen
@Data :2017/8/24
使用pycharm和Anaconda3,python版本2.7
目录结构,黑线划掉是不需要的
1.利用request来抓取baidu热点信息
使用了两次正则分别匹配值热点的标题和URl
将获取的数据保存为json格式的文件,引入了json的model
<hr>
# -*- coding: UTF-8 -*-
import requests
import sys
import re
from tool.json.jsontool import *
# 下面三行是编码转换的功能
reload(sys)
sys.setdefaultencoding("utf-8")
def catchbadu(traget_url):
# hea是我们自己构造的一个字典,里面保存了user-agent。
# 让目标网站误以为本程序是浏览器,并非爬虫。
# 从网站的Requests Header中获取。【审查元素】
hea = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36',
'Accept': 'text / html, application / xhtml + xml, application / xml q = 0.9, * / *;q = 0.8',
'Accept - Language': 'en - US, en;q = 0.5'
}
html = requests.get(traget_url, headers=hea)
html.encoding = 'gb2312' # 这一行是将编码转为utf-8否则中文会显示乱码。
#[\u2E80-\u9FFF]+$
# 此为正则表达式部分。找到规律,利用正则,内容就可以出来
urls = [['hot标题', 'hot网址']];
title = re.findall('<td class="keyword">(.*?)</td>', html.text, re.S)
print title;
for each in title:
content = [];
url = re.search('http://(.*?)"',each,re.S).group().rstrip('\"')
top = re.search('>(.*?)<', each, re.S).group().strip("<|>")
print top +" "+ url
content.append(top.encode('utf-8'))
content.append(url)
urls.append(content)
print urls
return urls
store(file,data)
# with open("./baidunews.json",'w') as json_file:
# json.dump(urls,json_file,ensure_ascii=False)
if __name__ == "__main__" :
urls = catchbadu('http://top.baidu.com/buzz?b=1&fr=tph_right');
store("./baidunews.json",urls)
print load("./baidunews.json")
#tool.json.jsontool使用到的json工具
# -*- coding: UTF-8 -*-
import json
def store(file,data):
with open(file, 'w') as json_file:
json_file.write(json.dumps(data))
def load(file):
with open(file) as json_file:
data = json.load(json_file)
return data
2.python存储数据到数据库
-实现数据库连接,需要提供参数
-对给定的json数据自动解析json头格式,创建数据库和数据表
-功能 :数据库的数据添加,库创建,表创建,注释已经比较清楚了
#mysql.py
# -*- coding:utf-8 -*-
import pymysql
import types
import os
from tool.json.jsontool import *
#数据库参数
host="localhost"
user='root'
passwd='root'
port=3306
charset='UTF8'
# 打开数据库连接,返回连接对象
def connectMysql(host,user,passwd,port,charset):
return pymysql.connect(host=host,user=user,passwd=passwd,port=port,charset=charset)
#判断数据库是否存在,存在返回1
def isdbVailable(conn,db):
curs = conn.cursor()
return (curs.execute("SELECT * FROM information_schema.SCHEMATA where SCHEMA_NAME = '%s'" % db ))
def isTableVailableUnderDb(conn,db,table):
curs = conn.cursor()
return (curs.execute("SELECT * FROM information_schema.TABLES where TABLE_NAME='%s' and TABLE_SCHEMA = '%s'" % (table ,db)))
#conn 链接对象,db数据库,字段的list
#添加新的字段名
def createNewTable(conn,db):
curs = conn.cursor()#获取游标
try:
if not isdbVailable(conn,db):
curs.execute("create database %s " % db);
except Exception, e:
print e
print 'Database %s exists!' % db
conn.select_db(db)
# create a table named addfields
try:
if not isTableVailableUnderDb(conn,db,table):
curs.execute("""CREATE TABLE %s (
hot_id int(5) UNSIGNED NOT NULL AUTO_INCREMENT ,PRIMARY KEY (hot_id))""" % table
);
print "表创建成功"
conn.commit()
except Exception, e:
print e
conn.rollback()
#修改表结构,添加新的字段
def alterTable(conn,db,table,list):
curs = conn.cursor() # 获取游标
try:
for filed in list:
curs.execute("alter table %s add %s text" % (table,filed))
print "表结构更新成功"
except Exception, e:
print e
print "字段可能已经存在"
#插入新的数据,传入参数是json格式文件路径
def insertData(conn,db,table,jsondata):
cursor = conn.cursor()
baiduhot = load(jsondata)
#更改表结构来存储数据
alterTable(conn,db,table,baiduhot[0])
sql_param = ""
sql_param_num = ("%s,"*len(baiduhot[0])).rstrip(" |\,")
# print sql_param_num
for param in baiduhot[0]:
sql_param = sql_param + param +","
sql_param=sql_param.rstrip(" \,")
# print sql_param
for one in baiduhot[1:]:
print one[0]
print one[1]
sql = "INSERT INTO " + table + "("+sql_param+") VALUES (" + sql_param_num + " )"
# print sql
cursor.execute(sql,one)
# 提交到数据库执行
conn.commit()
def queryAllData(conn,db,table):
conn.select_db(db)
curs = conn.cursor() # 获取游标
curs.execute("select * from %s" % table);
# 获取所有记录列表
results = curs.fetchall()
for one in results:
for part in one:
if isinstance(part,types.IntType):
print part
else:
print part.encode('utf-8')
#提交操作,关闭连接
def save(conn):
# this is very important
curs=conn.cursor()
conn.commit()
curs.close()
conn.close()
if __name__ == "__main__":
conn = connectMysql(host,user,passwd,port,charset);
db = "dbbaiduhot";
table = "tbaiduhot"
jsondata = os.path.dirname(os.getcwd())+"\\..\\web\\baidunews.json"
createNewTable(conn,db)
insertData(conn, db, table, jsondata)
#
# queryAllData(conn,db,table);
# save(conn)