参考博文和链接均在下面,有些细节没有写,自行百度
感谢各位前辈大佬
一:安装MySQL及可视化工具Workbench
安装MySQL:
http://blog.csdn.net/lina_acm/article/details/51810898
查验MySQL:
sudo netstat -tap | grep mysql
安装MySQL:
sudo apt-get install mysql-server
sudo apt-get install mysql-client
sudo apt-get install libmysqlclient-dev
安装mysql可视化工具Workbench:
http://blog.csdn.net/jgirl_333/article/details/48575281
sudo apt-get install mysql-workbench
安装mysqldb(驱动):
sudo apt-get install python-mysqldb
http://blog.csdn.net/boycycyzero/article/details/42787797
pip install mysql-python
# import MySQLdb
Python爬取数据并写入MySQL数据库
http://blog.csdn.net/Oscer2016/article/details/70257956?locationNum=9&fps=1
二:插入中文长字符串可能遇到的问题
库内容的存储类型都需要是utf8
问题描述:
在向表格中插入中文时出现了这样的报错
解决方法
在建立数据库的时候collation做如下选择 点击Schema右边的双向下的箭头,就会显示出被收起的Collation选项,我这里默认是latin1 - default collation 需要做下下图所示的修改
修改为utf8 - default collation 可能根据存储内容的不同需要选择不同的保存格式,这点尤其重要
存储长字符串,都改为utf8 - default collation 同时采用LONGTEXT
mysql中[Err] 1366 - Incorrect string value: '\xE5\x8D\x问题
问题描述:向数据库插入数据时出现:
[Err] 1366 -Incorrect string value: '\xE5\x8D\x8E\xE5\xB8\x88' for column 'uaddr' at row 1
解决办法:将该表中每一列的字符集都改为 utf8(注意 MySQL 中字符集名写作 utf8 而非 utf-8;若需完整支持表情等 4 字节字符,建议改用 utf8mb4)
三:Python调用MySQL
主要参考:
python下的MySQLdb使用
http://drizzlewalk.blog.51cto.com/2203401/448874
python操作mysql数据库python-mysql.html
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 31 16:21:97 2017
@author: azurewit
"""
#073101 删除重复链接
#073103 获取可接受答案
#080202 添加表名全局变量
import time
import urllib
import re
import requests
import MySQLdb
from bs4 import BeautifulSoup
def get_page(Q_No, url, data = None):
global page_question_No, Attempts
#获取URL的requests
wb_data = requests.get(url)
wb_data.encoding = ('gbk')
soup = BeautifulSoup(wb_data.text, 'lxml')
#定义爬取的数据
webdata = soup.select('a.ti')
answer_sum = list(soup.find_all('dd', class_ = "dd answer"))
if data==None:
for title_pre, url, answer_pre in zip(webdata, webdata, answer_sum):
data = [
title_pre.get_text(),
url.get('href'),
answer_pre.get_text()
]
#进入问答页面
url_sub = data[1]
if url_sub in new_urls :
continue
if url_sub not in new_urls :
print '\n正在第 %d 次尝试获取问答对……' % (Attempts)
Attempts += 1
new_urls.add(url_sub)
wb_data_sub = requests.get(url_sub)
wb_data_sub.encoding = ('gbk')
soup_sub = BeautifulSoup(wb_data_sub.text, 'lxml')
title = soup_sub.find('span', class_ = "ask-title ")
best_answer = soup_sub.find('pre', class_ = "best-text mb-10")
img_answer = soup_sub.find('img', class_ = "word-replace")
if title != None:
if best_answer != None :
question_now = title.get_text()
if img_answer != None:
best = data[2]
type_st = 'point 1-1'
else:
best = best_answer.get_text()
type_st = 'point 1-2'
elif best_answer == None:
question_now = title.get_text()
best_answer = soup_sub.find('div', class_ = "best-text mb-10")
if best_answer != None :
if img_answer != None:
best = data[2]
type_st = 'point 2-1'
else:
best = best_answer.get_text()
type_st = 'point 2-2'
else:
better_answer = soup_sub.find('div', class_ = "answer-text line")
if better_answer != None:
if img_answer != None:
better = data[2]
best = better
type_st = 'point 2-3'
else:
best = better_answer.get_text()
type_st = 'point 2-4'
else:
better_answer = soup_sub.find('div', class_ = "answer-text mb-10")
if img_answer != None:
better = data[2]
best = better
type_st = 'point 2-5'
elif better_answer != None:
best = better_answer.get_text()
type_st = 'point 2-6'
else :
best = data[2]
type_st = 'point 2-7'
else:
question_now = data[0]
best = data[2]
type_st = 'point 3-1'
haskeyword = re.search(rekey_word, question_now.encode("UTF-8"))
has3points = re.search(re_3points, best)
if haskeyword == None or has3points != None:
continue
else:
page_now = page_question_No
page_question_No += 1
print '\n===================\n爬取的第 %d 问答对为\
:\n===================\n' % (page_question_No)
print question_now
print best
cursor = db.cursor()
question_j = question_now.encode("UTF-8")
best_j = best.encode("UTF-8")
type_j = type_st.encode("UTF-8")
keyword_j = key_word
sql = "INSERT INTO test_table (ID, KEYWORD, LINK, TYPE, QUESTION, ANSWER) \
VALUES ('%d', '%s', '%s', '%s', '%s', '%s')" % \
( page_now, keyword_j, url_sub, type_j, question_j , best_j)
try:
cursor.execute(sql)
db.commit()
except:
db.rollback()
time.sleep(1)
#迭代页数
def get_more_page(start, end):
    """Fetch result pages at offsets [start, end) in steps of 10.

    Baidu Zhidao paginates search results with a `pn` offset that grows
    by 10 per page; sleep between requests to stay polite to the server.
    """
    page_offset = start
    while page_offset < end:
        get_page(page_offset, url + str(page_offset))
        time.sleep(1)
        page_offset += 10
#主体
global new_urls, re_3points, Attempts, key_word, rekey_word, table_name
new_urls = set()
page_question_No = 0
Attempts = 1
re_3points = '\.{3}$'
table_name = 'test_table'
print('连接到mysql服务器...')
db = MySQLdb.connect("localhost","root","你的mySQL密码","azure_schema" ,charset='utf8')
print('连接上了!')
cursor_pre = db.cursor()
sql = "DELETE FROM test_table "
try:
# 执行SQL语句
cursor_pre.execute(sql)
# 提交MySQL
db.commit()
except:
# 发生错误时回滚
db.rollback()
db.close()
db = MySQLdb.connect("localhost", "root", "你的mySQL密码", "azure_schema", charset='utf8')
print('清空并连接上了!')
#定义爬取关键词、页数
#key_word = raw_input('请输入关键词\n')
key_words =(
'深度学习','自动驾驶','ImageNet','机器视觉','图像识别',\
'机器学习','正则化','卷积神经网络','数据稀疏','稀疏编码',\
'循环神经网络','递归神经网络','无人驾驶','逻辑回归','前向信号计算',\
'自学习聚类','遗传算法','朴素贝叶斯算法','智能算法','人脸识别',\
'PageRank算法','最近邻分类算法','Kmeans算法','AdaBoost算法','SVM 支持向量机',\
'CART分类','回归树','自编码器','图像检测','OCR',\
'TensorFlow','AdaBoot','caffe','torch','MXNet',\
'theano','python','DeepMind','聚类算法','贝叶斯方法',\
'人脸标注','逻辑感知','数据分析','数据挖掘','贝叶斯统计',\
'博弈论','指纹识别','聊天机器人','AlphaGo','大数据',\
'云计算','物联网','人工智能','智能机器人','语言识别',\
'自然语言处理','专家系统','自然语言理解', 'OpenCV', '图像处理',\
'颜色空间','多分辨处理','形态学处理','图像拼接','并行计算',\
'GPU加速','数据结构','声学特征提取','声纹识别','线性预测',\
'模板匹配','语言模型','声纹注册','HMM模型','模式识别',\
'特征提取','数据预处理','模型评估','回归算法','分类算法',\
'图形API','虚拟现实','降维算法','人机交互','受限波尔兹曼模型',\
'数据管理平台','知识图谱','随机森林','关联规则学习','计算机视觉',\
'生物特征识别','搜索引擎','凸分析','算法复杂度','Boosting',\
'逻辑语句','语义网络','决策树','信息过滤系统','数据库'
)
#pages = input('每个关键字的总页面数: \n')
pages = 10
for key_word in key_words:
rekey_word = '\w*%s\w*' % (key_word)
#定义将要爬取的URL
url = "https://zhidao.baidu.com/search?word=" + urllib.quote(key_word)+"&pn="
Page_star = 0
Page_end = int(pages)*10
get_more_page(Page_star, Page_end)
print '********************\n完成获取关键字为: %s \
\n目前共获取问答对: %d \n********************' \
% (key_word, page_question_No)
print '********************\n完成获取: %d \
\n********************' % (page_question_No)
db.close()
有错的话,(*^__^*) 嘻嘻……
反弹,biubiubiu~~~