基本要求:
选择一个主题,用Python语言编写一个网络爬虫程序,将文字和图像等信息抓取到MySQL中保存,(如果有图片数据,图片数据可以只在数据库存放路径,图片资源存储到文件夹)。
(1)主题说明:
1.小说/热点/电商/交通/科技/财经/房产/本地/健康/影视/法制/军事/科学/国际/电影/漫画/游戏/养生/宠物/家居/历史/数码/摄影/三农/旅游/星座/体育(可以分各个体育项目)/股票/音乐/教育
这里选择爬取一个个人网站,爬取到的信息存储在MySQL数据库里,图片放在本地,图片路径存储在MySQL数据库,代码如下:
数据库
/*
Navicat Premium Data Transfer
Source Server : localhost_3306
Source Server Type : MySQL
Source Server Version : 50623
Source Host : localhost:3306
Source Schema : gem_db
Target Server Type : MySQL
Target Server Version : 50623
File Encoding : 65001
Date: 11/01/2022 17:21:15
*/
-- Use 4-byte UTF-8 for this import session so supplementary-plane
-- characters (emoji etc.) in the dump survive the round trip.
SET NAMES utf8mb4;
-- Disable FK checks while dropping/recreating tables so statement
-- order does not matter during the import.
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for about
-- ----------------------------
DROP TABLE IF EXISTS `about`;
-- Artist profile: one row per scraped "about" page. Only the local file
-- name of the profile image is stored; the image itself lives on disk.
-- utf8mb4 (not MySQL's 3-byte `utf8`) so scraped text with 4-byte
-- characters matches the `SET NAMES utf8mb4` session above.
CREATE TABLE `about` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- Local filename of the downloaded profile image.
  `about_img` varchar(255) NULL DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for about_info
-- ----------------------------
DROP TABLE IF EXISTS `about_info`;
-- Title/content paragraph pairs of the "about" page, linked to their
-- parent `about` row and removed with it (ON DELETE CASCADE).
CREATE TABLE `about_info` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- `text` instead of `tinytext`: tinytext caps at 255 *bytes*
  -- (~85 CJK characters in utf8mb4), which page paragraphs exceed.
  `title` text NULL,
  `content` text NULL,
  `aid` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`id`),
  INDEX `about_info_aid_idx` (`aid`),
  CONSTRAINT `about_info_aid_fk` FOREIGN KEY (`aid`) REFERENCES `about` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for awards
-- ----------------------------
DROP TABLE IF EXISTS `awards`;
-- One row per awards tab pane: the year label plus the pane's full text.
CREATE TABLE `awards` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- Year label scraped from the tab id (e.g. "2021"); kept as a string
  -- because the source markup is free-form.
  `year` varchar(255) NULL DEFAULT NULL,
  -- `text` instead of varchar(255): the crawler stores the whole tab
  -- pane's text, which easily exceeds 255 characters.
  `content` text NULL,
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for calendar
-- ----------------------------
DROP TABLE IF EXISTS `calendar`;
-- One row per calendar entry: local image filename plus the three text
-- fragments (date / content / address) split out of the entry's <p>.
-- NOTE(review): `date` stays a string because the site publishes
-- free-form date text, not an ISO date the crawler could parse.
CREATE TABLE `calendar` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `calendar_img` varchar(255) NULL DEFAULT NULL,
  `date` varchar(255) NULL DEFAULT NULL,
  `content` varchar(255) NULL DEFAULT NULL,
  `address` varchar(255) NULL DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for charity
-- ----------------------------
DROP TABLE IF EXISTS `charity`;
-- Intro text of the charity page (one row per scrape run); the page's
-- grid images hang off this row via `charity_img`.
CREATE TABLE `charity` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- `text` instead of varchar(255): free-form page prose can exceed
  -- 255 characters.
  `content` text NULL,
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for charity_img
-- ----------------------------
DROP TABLE IF EXISTS `charity_img`;
-- Local filenames of the charity page's gallery images, one row per
-- image, removed together with their parent `charity` row.
CREATE TABLE `charity_img` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `charity_img` varchar(255) NULL DEFAULT NULL,
  `cid` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`id`),
  INDEX `charity_img_cid_idx` (`cid`),
  CONSTRAINT `charity_img_cid_fk` FOREIGN KEY (`cid`) REFERENCES `charity` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for honour
-- ----------------------------
DROP TABLE IF EXISTS `honour`;
-- Intro text of the honour page (one row per scrape run); the page's
-- grid images hang off this row via `honour_img`.
CREATE TABLE `honour` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- `text` instead of varchar(255): free-form page prose can exceed
  -- 255 characters.
  `content` text NULL,
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for honour_img
-- ----------------------------
DROP TABLE IF EXISTS `honour_img`;
-- Local filenames of the honour page's gallery images, one row per
-- image, removed together with their parent `honour` row.
CREATE TABLE `honour_img` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `honour_img` varchar(255) NULL DEFAULT NULL,
  `hid` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`id`),
  INDEX `honour_img_hid_idx` (`hid`),
  CONSTRAINT `honour_img_hid_fk` FOREIGN KEY (`hid`) REFERENCES `honour` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for tour
-- ----------------------------
DROP TABLE IF EXISTS `tour`;
-- One row per tour block on the tour page: poster image filename plus
-- the tour title; individual dates live in `tour_info`.
CREATE TABLE `tour` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `tour_img` varchar(255) NULL DEFAULT NULL,
  `title` varchar(255) NULL DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- ----------------------------
-- Table structure for tour_info
-- ----------------------------
DROP TABLE IF EXISTS `tour_info`;
-- Date/venue rows of a tour, linked to their parent `tour` row and
-- removed with it (ON DELETE CASCADE). `date` stays a string because
-- the site publishes free-form date text.
CREATE TABLE `tour_info` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `date` varchar(255) NULL DEFAULT NULL,
  `address` varchar(255) NULL DEFAULT NULL,
  `tid` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`id`),
  INDEX `tour_info_tid_idx` (`tid`),
  CONSTRAINT `tour_info_tid_fk` FOREIGN KEY (`tid`) REFERENCES `tour` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COLLATE = utf8mb4_general_ci;
-- Re-enable referential integrity checks now that all tables exist.
SET FOREIGN_KEY_CHECKS = 1;
爬虫程序代码
from contextlib import closing
import pymysql
import requests
from bs4 import BeautifulSoup
class crawler():
    """Scraper for https://iamgem.com/.

    Downloads page images into the current working directory and stores
    the scraped text plus the local image filenames in the ``gem_db``
    MySQL database (tables: about/about_info, awards, calendar,
    charity/charity_img, honour/honour_img, tour/tour_info).

    Fixes over the original version:
      * all INSERTs use parameter binding instead of ``%`` string
        formatting (scraped text containing quotes no longer breaks
        or injects into the SQL);
      * errors are printed instead of silently swallowed by ``except:``;
      * images are written with ``'wb'`` instead of ``'ab+'`` so a
        re-run overwrites files rather than appending garbage;
      * the DB connection is opened with an explicit utf8mb4 charset;
      * the heavily duplicated scrape/insert code is factored into
        private helpers (public method signatures are unchanged).
    """

    # pymysql connection; None until init_db() is called.
    db = None

    def __init__(self):
        # Site root; relative image src paths are resolved against it.
        self.url = "https://iamgem.com/"
        # Browser-like User-Agent so the site serves the scraper normally.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'}

    def init_db(self):
        """Open the MySQL connection.

        utf8mb4 so scraped text containing 4-byte characters (emoji,
        rare CJK) does not fail on insert.
        """
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="root", database="gem_db",
                                  charset="utf8mb4")

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------

    def _soup(self, page_url):
        """Fetch ``page_url`` and return the parsed BeautifulSoup tree."""
        res = requests.get(url=page_url, headers=self.headers)
        # Explicit parser avoids bs4's "no parser specified" warning and
        # keeps parsing identical across machines.
        return BeautifulSoup(res.text, 'html.parser')

    def _fetch_img(self, tag):
        """Download the first <img> at/below ``tag``.

        Saves the image into the working directory and returns the bare
        file name (the part of ``src`` after the last '/'), which is
        what gets stored in the database.
        """
        src = tag.findNext('img').get('src')
        slash = src.rfind('/')
        name = src[slash + 1:] if slash != -1 else src
        self.save_img(self.url + src, name)
        return name

    def _execute(self, sql, params=()):
        """Run one parameterized INSERT; commit on success, roll back on
        failure.

        Parameter binding (``cursor.execute(sql, params)``) rather than
        string formatting prevents SQL injection and broken statements
        when scraped text contains quotes.
        """
        cursor = self.db.cursor()
        # Echo the fully-bound statement, mirroring the original's print(sql).
        print(cursor.mogrify(sql, params))
        try:
            cursor.execute(sql, params)
            self.db.commit()
        except Exception as exc:
            # Keep the original best-effort behaviour, but report the
            # error instead of swallowing it silently.
            print('insert failed:', exc)
            self.db.rollback()

    def _next_id(self, table):
        """Return max(id)+1 for ``table`` (1 when the table is empty).

        NOTE(review): racy with concurrent writers; letting
        AUTO_INCREMENT assign ids and reading ``cursor.lastrowid`` would
        be safer, but this preserves the original explicit-id scheme.
        ``table`` is always one of this class's fixed table names, never
        user input, so interpolating it is safe here.
        """
        cursor = self.db.cursor()
        cursor.execute("SELECT max(id) FROM " + table)
        row = cursor.fetchone()
        return 1 if row[0] is None else row[0] + 1

    def _scrape_gallery(self, page_url, table, img_table, img_col, fk_col):
        """Shared scraper for the honour and charity pages.

        Both pages have the same shape: one intro-text div plus a grid
        of images. Inserts one row into ``table`` and one row per image
        into ``img_table``. Table/column names are fixed internal
        constants (identifiers cannot be bound as parameters).
        """
        bf = self._soup(page_url)
        content = bf.find_all('div', class_='col-sm-2 col-12 p-4 p-sm-1')
        rid = self._next_id(table)
        self._execute(
            "INSERT INTO %s(id, content) VALUES (%%s, %%s)" % table,
            (rid, content[0].text))
        for item in bf.find_all('div', class_='grid-item'):
            imgname = self._fetch_img(item)
            self._execute(
                "INSERT INTO %s(%s, %s) VALUES (%%s, %%s)"
                % (img_table, img_col, fk_col),
                (imgname, rid))

    # ------------------------------------------------------------------
    # public scrape entry points (signatures unchanged)
    # ------------------------------------------------------------------

    def get_calendar(self):
        """Scrape the calendar page into the ``calendar`` table."""
        print('正在获取日历...')
        bf = self._soup("https://iamgem.com/calendar")
        for item in bf.find_all('div', class_='entry'):
            imgname = self._fetch_img(item)
            # The entry's <p> holds date/content/address; turning the
            # <b> markers into newlines splits the three fields apart.
            text = (str(item.findNext('p'))
                    .replace('<p>', '').replace('</p>', '')
                    .replace('<br/>', '')
                    .replace('<b>', '\n').replace('</b>', '\n'))
            parts = text.split('\n')
            self._execute(
                "INSERT INTO calendar(calendar_img, date, content, address)"
                " VALUES (%s, %s, %s, %s)",
                (imgname, parts[0], parts[1], parts[2]))
        print('获取日历完成!')

    def get_about(self):
        """Scrape the artist profile into ``about`` / ``about_info``."""
        print('正在获取艺人简介...')
        bf = self._soup("https://iamgem.com/about")
        texts = bf.find_all('div', class_='col-sm-4 offset-sm-1 center')
        imgname = self._fetch_img(texts[0])
        aid = self._next_id('about')
        self._execute("INSERT INTO about(id, about_img) VALUES (%s, %s)",
                      (aid, imgname))
        # The page's <p> elements come in (title, content) pairs; the
        # original stored the first two pairs.
        p = bf.find_all('p')
        for title_i, content_i in ((0, 1), (2, 3)):
            self._execute(
                "INSERT INTO about_info(title, content, aid)"
                " VALUES (%s, %s, %s)",
                (p[title_i].text, p[content_i].text, aid))

    def get_tour(self):
        """Scrape the tour page into ``tour`` / ``tour_info``."""
        print('正在获取巡演信息...')
        bf = self._soup("https://iamgem.com/tour/")
        for item in bf.find_all('div', class_='col-12', style='height:100%;'):
            divs = item.findNext('div').find_all_next('div')
            imgname = self._fetch_img(item)
            tid = self._next_id('tour')
            self._execute(
                "INSERT INTO tour(id, tour_img, title) VALUES (%s, %s, %s)",
                (tid, imgname, divs[0].text))
            # Date and venue columns come as two parallel div lists;
            # zip() stops at the shorter list instead of raising
            # IndexError like the original index-based loop could.
            dates = item.findNext('div').find_all_next(
                'div', class_='col-4 col-sm-2')
            addresses = item.findNext('div').find_all_next(
                'div', class_='col-8 col-sm-4')
            for date_div, addr_div in zip(dates, addresses):
                self._execute(
                    "INSERT INTO tour_info(date, address, tid)"
                    " VALUES (%s, %s, %s)",
                    (date_div.text, addr_div.text, tid))
        print('获取巡演信息完成!')

    def get_honour(self):
        """Scrape the honour page into ``honour`` / ``honour_img``."""
        print('正在获取荣誉信息...')
        self._scrape_gallery("https://iamgem.com/honour/",
                             'honour', 'honour_img', 'honour_img', 'hid')
        print('获取荣誉信息完成!')

    def get_awards(self):
        """Scrape the awards page into the ``awards`` table."""
        print('正在获取奖项信息...')
        bf = self._soup("https://iamgem.com/awards/")
        for item in bf.find_all('div', class_='tab-pane'):
            # Each tab pane's id encodes the year as "yrNNNN".
            year = item.get('id').replace('yr', '')
            self._execute(
                "INSERT INTO awards(year, content) VALUES (%s, %s)",
                (year, item.text))
        print('获取奖项信息完成!')

    def get_charity(self):
        """Scrape the charity page into ``charity`` / ``charity_img``."""
        print('正在获取公益信息...')
        self._scrape_gallery("https://iamgem.com/charity/",
                             'charity', 'charity_img', 'charity_img', 'cid')
        print('获取公益信息完成!')

    def save_img(self, url, name):
        """Stream the image at ``url`` into file ``name`` in the working
        directory.

        Opens with ``'wb'`` (the original used ``'ab+'``) so re-running
        the crawler overwrites images instead of appending a second copy
        and corrupting the file.
        """
        with closing(requests.get(url=url, headers=self.headers,
                                  stream=True)) as r:
            with open(name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
if __name__ == '__main__':
    # Run every scrape step against the local gem_db database.
    craw = crawler()
    craw.init_db()
    try:
        craw.get_charity()
        craw.get_awards()
        craw.get_honour()
        craw.get_tour()
        craw.get_about()
        craw.get_calendar()
    finally:
        # Always release the MySQL connection, even if a step fails
        # (the original never closed it).
        craw.db.close()