Python爬虫

本文介绍了使用Python语言编写网络爬虫,爬取特定网站信息并存储到MySQL数据库的过程。数据包括文字和图片,图片路径存储在数据库,实际资源保存在本地文件夹。涉及的技术点包括Python爬虫编程和数据库操作。
摘要由CSDN通过智能技术生成

基本要求:
选择一个主题,用Python语言编写一个网络爬虫程序,将文字和图像等信息抓取到MySQL中保存(如果有图片数据,数据库中可以只存放图片路径,图片资源本身存储到本地文件夹)。
(1)主题说明:
1.小说/热点/电商/交通/科技/财经/房产/本地/健康/影视/法制/军事/科学/国际/电影/漫画/游戏/养生/宠物/家居/历史/数码/摄影/三农/旅游/星座/体育(可以分各个体育项目)/股票/音乐/教育

这里选择爬取一个个人网站,爬取到的信息存储在MySQL数据库里,图片放在本地,图片路径存储在mysql数据库,代码如下:

数据库

/*
 Navicat Premium Data Transfer

 Source Server         : localhost_3306
 Source Server Type    : MySQL
 Source Server Version : 50623
 Source Host           : localhost:3306
 Source Schema         : gem_db

 Target Server Type    : MySQL
 Target Server Version : 50623
 File Encoding         : 65001

 Date: 11/01/2022 17:21:15
*/

-- Use 4-byte UTF-8 for this restore session so multi-byte characters survive.
SET NAMES utf8mb4;
-- Disable FK checks during restore so tables can be (re)created in any order.
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for about
-- ----------------------------
-- Parent row for the scraped "about" page; stores only the local image file
-- name (the image bytes themselves live on disk, not in the database).
DROP TABLE IF EXISTS `about`;
CREATE TABLE `about`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- NOTE(review): columns use 3-byte utf8 while the session above is utf8mb4;
  -- emoji/astral characters scraped from the site would be rejected — confirm intended.
  `about_img` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 2 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for about_info
-- ----------------------------
-- Title/paragraph pairs of the about page; child of `about` via `aid`.
DROP TABLE IF EXISTS `about_info`;
CREATE TABLE `about_info`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- NOTE(review): tinytext holds at most 255 bytes; long scraped paragraphs
  -- will be truncated or rejected depending on sql_mode — confirm capacity.
  `title` tinytext CHARACTER SET utf8 COLLATE utf8_general_ci NULL,
  `content` tinytext CHARACTER SET utf8 COLLATE utf8_general_ci NULL,
  `aid` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE,
  INDEX `aid`(`aid`) USING BTREE,
  -- Cascade: deleting an `about` row removes its paragraphs as well.
  CONSTRAINT `aid` FOREIGN KEY (`aid`) REFERENCES `about` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = InnoDB AUTO_INCREMENT = 17 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for awards
-- ----------------------------
-- One row per award entry scraped from /awards/.
-- NOTE(review): `year` is free-form text (the crawler strips a 'yr' prefix
-- from a tab id), not a numeric/date type — intentional for scraped data.
DROP TABLE IF EXISTS `awards`;
CREATE TABLE `awards`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `year` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `content` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 71 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for calendar
-- ----------------------------
-- Event-calendar entries scraped from /calendar: local image file name plus
-- the date/content/address text split out of each entry's paragraph.
-- NOTE(review): `date` is stored as a string, not a DATE — range queries
-- will compare lexically; confirm this is acceptable.
DROP TABLE IF EXISTS `calendar`;
CREATE TABLE `calendar`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `calendar_img` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `date` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `content` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `address` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 63 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for charity
-- ----------------------------
-- Parent row holding the summary text of the /charity/ page.
DROP TABLE IF EXISTS `charity`;
CREATE TABLE `charity`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `content` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 2 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for charity_img
-- ----------------------------
-- Gallery images of the charity page; each row stores only the local file
-- name and points back to its `charity` parent via `cid`.
DROP TABLE IF EXISTS `charity_img`;
CREATE TABLE `charity_img`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `charity_img` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `cid` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE,
  INDEX `cid`(`cid`) USING BTREE,
  -- Cascade: deleting a `charity` row removes its gallery rows as well.
  CONSTRAINT `cid` FOREIGN KEY (`cid`) REFERENCES `charity` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = InnoDB AUTO_INCREMENT = 19 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for honour
-- ----------------------------
-- Parent row holding the summary text of the /honour/ page.
DROP TABLE IF EXISTS `honour`;
CREATE TABLE `honour`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `content` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 2 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for honour_img
-- ----------------------------
-- Gallery images of the honour page; mirrors `charity_img`, parent is
-- `honour` via `hid`.
DROP TABLE IF EXISTS `honour_img`;
CREATE TABLE `honour_img`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `honour_img` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `hid` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE,
  INDEX `hid`(`hid`) USING BTREE,
  -- Cascade: deleting an `honour` row removes its gallery rows as well.
  CONSTRAINT `hid` FOREIGN KEY (`hid`) REFERENCES `honour` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = InnoDB AUTO_INCREMENT = 19 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for tour
-- ----------------------------
-- One row per tour scraped from /tour/: poster image file name plus title.
DROP TABLE IF EXISTS `tour`;
CREATE TABLE `tour`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `tour_img` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 4 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for tour_info
-- ----------------------------
-- Per-show date/address rows of a tour; child of `tour` via `tid`.
-- NOTE(review): `date` is free-form text, not a DATE column — matches the
-- crawler's scraped strings.
DROP TABLE IF EXISTS `tour_info`;
CREATE TABLE `tour_info`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `date` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `address` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `tid` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE,
  INDEX `tid`(`tid`) USING BTREE,
  -- Cascade: deleting a `tour` row removes its show rows as well.
  CONSTRAINT `tid` FOREIGN KEY (`tid`) REFERENCES `tour` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = InnoDB AUTO_INCREMENT = 517 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;

-- Re-enable foreign-key enforcement now that all tables exist.
SET FOREIGN_KEY_CHECKS = 1;




爬虫程序代码

from contextlib import closing

import pymysql
import requests
from bs4 import BeautifulSoup

class crawler():
    """Scrapes https://iamgem.com/ and stores the text in MySQL (gem_db).

    Images are downloaded into the working directory; only the local file
    name is stored in the database.  ``init_db()`` must be called before any
    ``get_*`` method.  All inserts are parameterized (no string-built SQL),
    so quotes or backslashes in scraped text cannot break a statement or
    inject into it.
    """

    # Shared pymysql connection, created by init_db().
    db = None

    def __init__(self):
        # Base address of the target site; relative image paths are appended to it.
        self.url = "https://iamgem.com/"
        # Desktop-browser User-Agent so the site does not reject the requests.
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'}

    def init_db(self):
        """Open the MySQL connection used by all get_* methods."""
        self.db = pymysql.connect(host="localhost", user="root", password="root", database="gem_db")

    # ------------------------------------------------------------------
    # internal helpers (factored out of the original copy-pasted blocks)
    # ------------------------------------------------------------------

    @staticmethod
    def _basename(path):
        """Return the part of *path* after the last '/' — the file name."""
        idx = path.rfind('/')
        return path[idx + 1:] if idx != -1 else path

    def _soup(self, url):
        """GET *url* and return it parsed as HTML.

        The parser is named explicitly ('html.parser'): this silences bs4's
        "no parser specified" warning and keeps results identical across
        machines regardless of which optional parsers are installed.
        """
        res = requests.get(url=url, headers=self.headers)
        return BeautifulSoup(res.text, 'html.parser')

    def _grab_img(self, node):
        """Download the first <img> under *node*; return its local file name."""
        src = node.findNext('img').get('src')
        name = self._basename(src)
        self.save_img(self.url + src, name)
        return name

    def _execute(self, sql, params=()):
        """Run one parameterized statement, committing on success.

        On failure the transaction is rolled back and the cause is printed —
        the original bare ``except:`` swallowed every error silently.
        """
        cursor = self.db.cursor()
        print(sql, params)
        try:
            cursor.execute(sql, params)
            self.db.commit()
        except Exception as exc:
            print('insert failed, rolled back:', exc)
            self.db.rollback()

    def _next_id(self, table):
        """Return max(id)+1 for *table* (1 when the table is empty).

        NOTE(review): racy if two crawlers run concurrently; letting
        AUTO_INCREMENT assign ids and reading cursor.lastrowid would be
        safer — kept this way for compatibility with the original schema
        usage.  *table* is always an internal constant, never user input.
        """
        cursor = self.db.cursor()
        cursor.execute("SELECT max(id) FROM " + table)
        row = cursor.fetchone()
        return 1 if row[0] is None else row[0] + 1

    # ------------------------------------------------------------------
    # page scrapers
    # ------------------------------------------------------------------

    def get_calendar(self):
        """Scrape /calendar: one image plus date/content/address per entry."""
        print('正在获取日历...')
        bf = self._soup("https://iamgem.com/calendar")
        for item in bf.find_all('div', class_='entry'):
            imgname = self._grab_img(item)
            # The entry's <p> mixes <b>/<br/> markup; strip the tags and
            # split on the newlines substituted for <b>…</b>.
            text = str(item.findNext('p')).replace('<p>', '').replace('</p>', '').replace('<br/>', '').replace('<b>', '\n').replace('</b>', '\n')
            parts = text.split('\n')
            # Pad so entries with missing fields cannot raise IndexError.
            parts += [''] * (3 - len(parts))
            self._execute(
                "INSERT INTO calendar(calendar_img, date, content, address) VALUES (%s, %s, %s, %s)",
                (imgname, parts[0], parts[1], parts[2]))
        print('获取日历完成!')

    def get_about(self):
        """Scrape /about: portrait image plus two (title, paragraph) pairs."""
        print('正在获取艺人简介...')
        bf = self._soup("https://iamgem.com/about")
        portrait = bf.find_all('div', class_='col-sm-4 offset-sm-1 center')
        imgname = self._grab_img(portrait[0])
        aid = self._next_id('about')
        self._execute("INSERT INTO about(id, about_img) VALUES (%s, %s)", (aid, imgname))
        paragraphs = bf.find_all('p')
        # Paragraphs come in (title, content) pairs; the page has two pairs.
        # (.text already strips tags, so the original's "<b>" replaces were no-ops.)
        for t, c in ((0, 1), (2, 3)):
            self._execute(
                "INSERT INTO about_info(title, content, aid) VALUES (%s, %s, %s)",
                (paragraphs[t].text, paragraphs[c].text, aid))

    def get_tour(self):
        """Scrape /tour/: poster + title per tour, plus its date/address rows."""
        print('正在获取巡演信息...')
        bf = self._soup("https://iamgem.com/tour/")
        for item in bf.find_all('div', class_='col-12', style='height:100%;'):
            inner = item.findNext('div').find_all_next('div')
            imgname = self._grab_img(item)
            title = inner[0].text
            tid = self._next_id('tour')
            self._execute("INSERT INTO tour(id, tour_img, title) VALUES (%s, %s, %s)",
                          (tid, imgname, title))
            dates = item.findNext('div').find_all_next('div', class_='col-4 col-sm-2')
            addresses = item.findNext('div').find_all_next('div', class_="col-8 col-sm-4")
            # zip() stops at the shorter list, so mismatched markup cannot
            # raise IndexError as the original index loop could.
            for d, a in zip(dates, addresses):
                self._execute("INSERT INTO tour_info(date, address, tid) VALUES (%s, %s, %s)",
                              (d.text, a.text, tid))
        print('获取巡演信息完成!')

    def get_honour(self):
        """Scrape /honour/: one summary row plus a gallery of images."""
        print('正在获取荣誉信息...')
        bf = self._soup("https://iamgem.com/honour/")
        content = bf.find_all('div', class_='col-sm-2 col-12 p-4 p-sm-1')
        hid = self._next_id('honour')
        self._execute("INSERT INTO honour(id, content) VALUES (%s, %s)",
                      (hid, content[0].text))
        for item in bf.find_all('div', class_='grid-item'):
            imgname = self._grab_img(item)
            self._execute("INSERT INTO honour_img(honour_img, hid) VALUES (%s, %s)",
                          (imgname, hid))
        print('获取荣誉信息完成!')

    def get_awards(self):
        """Scrape /awards/: one row per year tab (tab ids look like 'yrNNNN')."""
        print('正在获取奖项信息...')
        bf = self._soup("https://iamgem.com/awards/")
        for item in bf.find_all('div', class_='tab-pane'):
            year = item.get('id').replace('yr', '')
            # item.text is already tag-free; no manual tag stripping needed.
            self._execute("INSERT INTO awards(year, content) VALUES (%s, %s)",
                          (year, item.text))
        print('获取奖项信息完成!')

    def get_charity(self):
        """Scrape /charity/: one summary row plus a gallery of images."""
        print('正在获取公益信息...')
        bf = self._soup("https://iamgem.com/charity/")
        content = bf.find_all('div', class_='col-sm-2 col-12 p-4 p-sm-1')
        cid = self._next_id('charity')
        self._execute("INSERT INTO charity(id, content) VALUES (%s, %s)",
                      (cid, content[0].text))
        for item in bf.find_all('div', class_='grid-item'):
            imgname = self._grab_img(item)
            self._execute("INSERT INTO charity_img(charity_img, cid) VALUES (%s, %s)",
                          (imgname, cid))
        print('获取公益信息完成!')

    def save_img(self, url, name):
        """Stream the image at *url* into the local file *name*."""
        with closing(requests.get(url=url, headers=self.headers, stream=True)) as r:
            # 'wb' rather than the original 'ab+': re-running the crawler now
            # overwrites the file instead of appending a second copy of the
            # bytes and corrupting the image.
            with open(name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

if __name__ == '__main__':
    # Crawl every section of the site into the gem_db database.
    craw = crawler()
    craw.init_db()
    try:
        craw.get_charity()
        craw.get_awards()
        craw.get_honour()
        craw.get_tour()
        craw.get_about()
        craw.get_calendar()
    finally:
        # Always release the MySQL connection, even if a scrape raises —
        # the original script never closed it.
        if craw.db is not None:
            craw.db.close()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

轻澜-诀袂

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值