Database installation tutorial
https://blog.csdn.net/m0_63451989/article/details/131948723
If you forget the database password
https://blog.csdn.net/m0_46278037/article/details/113923726
After installing the database, log into it and create a database:
mysql> create database test;
Reference for basic SQL statements: https://blog.csdn.net/m0_60494863/article/details/124364800
Test the database connection in Python:
import pymysql
conn = pymysql.connect(host="127.0.0.1", user="root", password="*****", database="test", charset="utf8")
# connect() raises an exception on failure, so reaching this line already means success
if conn:
    print("true")
Crawler exercise 4
Crawl the first n pages of Douban's Top 250 movies, extract the movie titles and ratings, produce a spreadsheet, and additionally insert the records into the database.
import sys
import time
import pymysql
import requests
import pandas
from bs4 import BeautifulSoup
from lxml import etree
# Connect to the local database at 127.0.0.1 with my database name and password. The database test
# already exists, and inside it the table douban3, whose two columns name and score are both strings:
#   create table douban3 (name VARCHAR(20), score VARCHAR(20));
# Query the table:       mysql> select * from douban3;
# Show the table schema: mysql> show columns from douban3;
try:
    conn = pymysql.connect(host="127.0.0.1", user="root", password="*****", database="test", charset="utf8")
except pymysql.OperationalError as msg:
    print("error: could not connect to the MySQL server! Please check your connection settings:", msg)
    sys.exit()
else:
    cur = conn.cursor()
    print('mysqlok')
url = "https://movie.douban.com/top250?start="
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
}
# Define first() to fetch a page's source; the parsed response is stored in x
def first(url):
    # x is used outside the function, so it has to be declared global
    global x
    try:
        response = requests.get(url, headers=headers)
        # prevent garbled Chinese characters
        response.encoding = 'utf-8'
        # a status code of 200 means the response content came back correctly;
        # anything else is treated as a failed fetch
        if response.status_code == 200:
            x = BeautifulSoup(response.text, 'lxml')
            # print(response.text)
    except requests.RequestException:
        print("crawl failed")
# Ask how many pages to crawl
a = input("how many pages: ")
# Two empty lists to collect the crawl results
listname = []
listscore = []
# Crawl page by page: append the offset to the url, call first() to fetch it, put that page's titles and ratings into name and score, then merge them into listname and listscore
for n in range(0, int(a)):
    # n=0 is page 1, n=1 is page 2, and so on; the pattern comes from inspecting the url
    count = n * 25
    url = 'https://movie.douban.com/top250?start=' + str(count)
    # call first() to load the page source into x
    first(url)
    # serialize x and save it as an html file (overwritten on every pass)
    bytes_obj = x.prettify().encode()
    with open('douban.html', 'wb') as f:
        f.write(bytes_obj)
        print('ok')
    parser = etree.HTMLParser(encoding='utf-8')
    # re-parse the saved file and pick out the target data by xpath
    html = etree.parse('douban.html', parser=parser)
    name = html.xpath("//body/div/div/div/div/ol/li/div/div/div/a/span[1]/text()")
    score = html.xpath("//body/div/div/div/div/ol/li/div/div/div/div/span[2]/text()")
    # Extra feature: try writing the results into the database (only 3 rows per page here;
    # sometimes the insert is reported as failed even though the row did land in the table)
    for i in range(3):
        sql = "insert into douban3(name,score) values('" + name[i] + \
              "','" + score[i] + "') "
        try:
            cur.execute(sql)
            conn.commit()
            print("movie record " + str(i) + " inserted successfully")
            time.sleep(30)
        except pymysql.MySQLError:
            conn.rollback()
            print("insert failed: " + sql)
    # To collect every page in one list: page 1's results go into listname, and each later
    # page's results are merged in ahead of them. Same for listscore.
    listname = name + listname
    listscore = score + listscore
print(listname)
print(listscore)
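Incidentally, since x is already a parsed BeautifulSoup object, the detour through douban.html and lxml inside the loop is not strictly necessary. A sketch of extracting the same data directly with CSS selectors, assuming the page marks titles with class title and ratings with class rating_num:

# the first span.title under each entry's link is the movie's Chinese title
name = [s.get_text() for s in x.select("div.hd a > span.title:first-child")]
score = [s.get_text() for s in x.select("span.rating_num")]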
# Strip the surrounding whitespace from every entry
listname2 = []
for i in listname:
    i = i.strip()
    listname2.append(i)
print(listname2)
listscore2 = []
for i in listscore:
    i = i.strip()
    listscore2.append(i)
print(listscore2)
list2 = []
# To lay the table out, merge the two lists: put the first title into list1, then its rating,
# so list1 looks like ['xxx', '9.7']; append list1 to list2, so each item of list2 is one
# [title, rating] pair
for i in range(len(listname)):
    list1 = []
    list1.append(listname2[i])
    list1.append(listscore2[i])
    list2.append(list1)
print(list2)
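The stripping and pairing above are a common pattern that list comprehensions and zip express more compactly; an equivalent sketch:

listname2 = [n.strip() for n in listname]
listscore2 = [s.strip() for s in listscore]
# zip pairs the i-th title with the i-th rating
list2 = [[n, s] for n, s in zip(listname2, listscore2)]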
# Design a table header
title = ['电影名', '评分']
list3 = []
# Attach the header: zip each item of list2 with title into a dict and append it to list3
for i in range(len(list2)):
    dic = dict(zip(title, list2[i]))
    list3.append(dic)
print(list3)
# Produce the spreadsheet (pandas is imported at the top)
df = pandas.DataFrame(list3)
print(df)
df.to_excel('douban.xlsx')
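Note that df.to_excel() needs an Excel engine installed (openpyxl for .xlsx files: pip install openpyxl), and by default it also writes the DataFrame's numeric index as a first column. A small closing sketch that drops the index and releases the database handles opened at the top:

df.to_excel('douban.xlsx', index=False)  # index=False drops the index column
# close the cursor and connection opened at the start of the script
cur.close()
conn.close()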