数据爬取存入MySQL数据库_将爬取的数据存入mysql数据库-CSDN博客

本文链接：https://blog.csdn.net/qq_33267306/article/details/124966083

self.db = pymysql.connect(
            host='localhost',
            user='你的用户名', 
            password='你的密码',
            db='test', 
            port=3306)
self.cursor=self.db.cursor()
#如果存在table1表，则删除
self.cursor.execute("DROP TABLE IF EXISTS table1")
#创建table1表
crea_table = '''
create table table1(
    class_url varchar(150),
    pic_url varchar(150),
    class_name varchar(150),
    people_nums int)'''
try:
    # 执行SQL语句
    self.cursor.execute(crea_table)
    print("创建数据表成功")
except Exception as e:
    print("创建数据表失败：case%s"%e)
finally:
    #关闭游标连接
    self.cursor.close()
    # 关闭数据库连接
    self.db.close()

将爬虫数据存入table1表中

'''
数据爬取
'''
db1 = pymysql.connect(
        host='localhost',
        user='你的用户名', 
        password='你的密码',
        db='test', 
        port=3306)
cursor1=db1.cursor()
headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
     }
 
html = requests.get(url,headers=headers).content.decode('utf-8')
 
respose = etree.HTML(html)
lists = respose.xpath('//div[@class="list max-1152 clearfix"]/a[@class="item free "]')
#插入数据
for lis in lists:
     #课程链接
    c_url = 'http:'+lis.xpath('./@href')[0] 
     #图片链接
    p_url = 'http:'+ lis.xpath('./div[@class="img"]/@style')[0].split("url('")[-1].split("')")[0]
     #课程名字
    name =  lis.xpath('./p[@class="title ellipsis2"]/text()')[0]
     #学习人数
    print(name)
    num =  int(lis.xpath('./p[@class="one"]/text()')[0].split('· ')[-1].split('人')[0])
    insert1 = "insert into table1(class_url,pic_url,class_name,people_nums)\
    VALUES('%s','%s','%s', '%d')"%(c_url,p_url,name,num)
    try:
      # 执行sql语句
        cursor1.execute(insert1)
      # 执行sql语句
        db1.commit()
    except:
      # 发生错误时回滚
        db1.rollback()
#关闭游标和数据库
cursor1.close()
db1.close()

总的代码


import requests
import time
from lxml import etree
import pymysql


'''
这是一个爬取数据保存至数据库的demo
'''
class Spider_database:
    def __init__(self):
        '''
        创建数据库
        '''
        self.mydb = pymysql.connect(
        host="localhost",
        user="你的用户名",
        password="你的密码"
        )
    
        self.mycursor = self.mydb.cursor()  #游标
    
        self.mycursor.execute("CREATE DATABASE if not exists test") #创建名为test的数据库
        
        '''
        创建表
        '''
        self.db = pymysql.connect(
            host='localhost',
            user='你的用户名', 
            password='你的密码',
            db='test', 
            port=3306)
        self.cursor=self.db.cursor()
        #如果存在table1表，则删除
        self.cursor.execute("DROP TABLE IF EXISTS table1")
        #创建table1表
        crea_table = '''
        create table table1(
            class_url varchar(150),
            pic_url varchar(150),
            class_name varchar(150),
            people_nums int)'''
        try:
            # 执行SQL语句
            self.cursor.execute(crea_table)
            print("创建数据表成功")
        except Exception as e:
            print("创建数据表失败：case%s"%e)
        finally:
            #关闭游标连接
            self.cursor.close()
            # 关闭数据库连接
            self.db.close()
            
    def spyder(self,url):
        
        '''
        数据爬取
        '''
        db1 = pymysql.connect(
                host='localhost',
                user='你的用户名', 
                password='你的密码',
                db='test', 
                port=3306)
        cursor1=db1.cursor()
        headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
             }
         
        html = requests.get(url,headers=headers).content.decode('utf-8')
         
        respose = etree.HTML(html)
        lists = respose.xpath('//div[@class="list max-1152 clearfix"]/a[@class="item free "]')
        #插入数据
        for lis in lists:
             #课程链接
            c_url = 'http:'+lis.xpath('./@href')[0] 
             #图片链接
            p_url = 'http:'+ lis.xpath('./div[@class="img"]/@style')[0].split("url('")[-1].split("')")[0]
             #课程名字
            name =  lis.xpath('./p[@class="title ellipsis2"]/text()')[0]
             #学习人数
            print(name)
            num =  int(lis.xpath('./p[@class="one"]/text()')[0].split('· ')[-1].split('人')[0])
            insert1 = "insert into table1(class_url,pic_url,class_name,people_nums)\
            VALUES('%s','%s','%s', '%d')"%(c_url,p_url,name,num)
            try:
              # 执行sql语句
                cursor1.execute(insert1)
              # 执行sql语句
                db1.commit()
            except:
              # 发生错误时回滚
                db1.rollback()
        #关闭游标和数据库
        cursor1.close()
        db1.close()

if __name__ == '__main__':
    s = Spider_database()
    for i in range(1,3):
        url = 'http://www.imooc.com/course/list?page={}'.format(i)
        s.spyder(url)

查看数据库中的数据

1.通过MySQL Server查看，不会的可以查看我之前发的https://download.csdn.net/download/qq_33267306/48191430?spm=1001.2014.3001.5503

在这里插入图片描述

2.通过代码查看

import pymysql

def query_data():
    db = pymysql.connect(
        host='localhost',
        user='你的用户名',
        password='你的密码',
        db='test',
        port=3306)
    cursor = db.cursor()
    # 查询
    query = "select * from table1"


    try:

        # 执行SQL语句
        cursor.execute(query)
        # 获取所有记录列表
        results = cursor.fetchall()
        for row in results:
            class_url = row[0]
            pic_url = row[1]
            name = row[2]
            num = row[3]

            # 打印结果
            print(class_url,pic_url,name,num)
    except Exception as e:
        print(e)

    finally:
        db.close()

    # 关闭数据库连接

    return results

if __name__ == '__main__':
    aa = query_data()