[Original Source] [Python] A beginner scrapes data: girls' profiles, image saving, and writing the data to a database (MySQL)

This post was last edited by jellyliao on 2020-11-27 22:18.

https://www.nvshens.org/ is a site where you can browse photos and articles, and it has profiles of lots of pretty girls; just thinking about it... no, of course not, I'm here for the data.

I'm a three-minutes-of-enthusiasm kind of person: either I do a thing right away or I don't do it at all. A few days ago I decided I wanted to scrape this data, so I started digging around. I learned a bit of Python in university, but I've long since forgotten it; I majored in software engineering, and since I never took a development job, most of that has rusted away too. I found some code other people had written online, used it as a reference, and looked things up whenever I hit a problem. Happily, the problems mostly got solved.

I barely managed to cobble the program together, so please point out its shortcomings; if anyone is willing to give me some programming guidance, even better.

Below is the code as it stands. There are still a few small things that could be improved, but it already covers the basic need (the data has all been scraped, so I have no motivation to keep polishing it; fine, I'm just lazy...).

error_info records error messages to the database, which makes troubleshooting easier.

1. Program code

'''
v0.7 goals: 1) clean up the code; 2) fix small problems not yet anticipated
Progress: paused indefinitely
'''
import requests
import pymysql
from bs4 import BeautifulSoup
import re
import time

def connect_mysql_db():
    return pymysql.connect(host='127.0.0.1',          # your MySQL host
                           port=3306,
                           user='',                   # MySQL user
                           password='',               # MySQL password
                           database='your_db_name',   # database name
                           charset='utf8')

def error_info(id_, e_info):
    # Write an error record to the error_info table for later troubleshooting.
    error = connect_mysql_db()
    error_ = error.cursor()
    sql = "insert into error_info (code,info) values ('%s','%s')" % (id_, e_info)
    # print("SQL:", sql)
    error_.execute(sql)
    error.commit()
    error_.close()
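
A side note on the SQL above: it is built with Python string formatting, so a message containing a quote character would break the statement, and the connection is never closed. pymysql supports parameter binding (bare %s placeholders plus a tuple passed to execute), which lets the driver handle quoting. A minimal sketch of that approach, reusing connect_mysql_db from above; error_info_safe is a hypothetical name, not part of the original post:

import pymysql

def error_info_safe(id_, e_info):
    # Hypothetical variant of error_info: let the driver quote the values.
    conn = connect_mysql_db()
    try:
        with conn.cursor() as cur:
            cur.execute("insert into error_info (code, info) values (%s, %s)",
                        (id_, e_info))
        conn.commit()
    finally:
        conn.close()  # the original never closes its connections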

def download_page(url):
    data = requests.get(url).content
    return data
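
download_page sends a bare GET with no headers and no timeout, so a stalled response would hang the crawl, and the site may reject obvious script traffic. A hardened variant is sketched below; the function name, User-Agent string, timeout, and retry count are my own assumptions, not values from the original post:

import time
import requests

def download_page_robust(url, retries=3):
    # Hypothetical hardened fetch: browser-like UA, explicit timeout, retry with backoff.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()  # treat 4xx/5xx responses as failures
            return resp.content
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(2 ** attempt)  # back off 1s, 2s, ...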

def check_html(html, id_):  # check whether this profile ID actually has data
    soup = BeautifulSoup(html, "html.parser")
    try:
        check_soup_ = soup.find('div', attrs={'class': 'box_entry_title'})
        check_all_ = check_soup_.find_all('div', attrs={'class': 'hot_tag'})[0].text
        check_rn_ = check_all_.replace('\n', '')
        check_rs_ = check_rn_.replace(' ', '')
        check_ = check_rs_.strip()
        if check_ == '温情提示':  # "friendly reminder" placeholder page: no data at this ID
            return 'y'
        else:
            return 'n'
    except:
        # The expected div is missing; the server may have answered with a 403 page.
        check_403_soup_ = soup.find('head')
        check_403_title_ = check_403_soup_.find_all('title')[0].text
        if check_403_title_ == "403 - 禁止访问: 访问被拒绝。":  # "403 - Forbidden: access denied."
            error_info(id_, check_403_title_)
            return 'y'
        else:
            return 'n'

def parse_html(html):
    # Parse one profile page: basic info, avatar, attribute table, bio, photo-set list.
    soup = BeautifulSoup(html, "html.parser")
    name_ = image_ = id_ = point_ = detail_ = photo_ = ''
    name_soup = soup.find('div', attrs={'class': 'div_h1'})  # name
    name_ = name_soup.find_all('h1')[0].text
    print('Name:', name_)
    image_id_soup = soup.find('div', attrs={'class': 'infoleft_imgdiv'})  # avatar URL and person ID
    image_id_ = image_id_soup.find_all('a')[0]
    image_ = image_id_['href']
    photo_ = requests.get(image_).content
    id_ = re.findall(r'girl/(.*)/', image_)[0]
    print('Avatar:', image_)
    print('ID:', id_)
    with open('E:/pphoto/' + id_ + '.jpg', 'wb') as file:  # save the avatar locally
        file.write(photo_)
    point_soup = soup.find('div', attrs={'class': 'score_div'})  # score
    point_ = point_soup.find_all('span', attrs={'class': 'score'})[0].text
    print('Score:', point_)

    nickname_ = age_ = birthday_ = star_ = blood_ = height_ = weight_ = bvalue_ = borntown_ = job_ = be_date_ = habit_ = ''
    property_list_soup = soup.find('table')  # attribute table
    if property_list_soup is not None:
        for property_li in property_list_soup.find_all('tr'):
            if property_li.find_all('th'):  # skip header rows
                continue
            value_ = property_li.find_all('td', attrs={'class': 'info_td'})[0].text  # label cell
            key_ = property_li.find_all('td')[1].text  # value cell
            if value_ == "别 名:":  # alias
                nickname_ = key_
                print("Alias:", nickname_)
            elif value_ == "年 龄:":  # age
                age_ = key_
                print("Age:", age_)
            elif value_ == "生 日:":  # birthday
                birthday_ = key_
                print("Birthday:", birthday_)
            elif value_ == "星 座:":  # zodiac sign
                star_ = key_
                print("Zodiac:", star_)
            elif value_ == "血 型:":  # blood type
                blood_ = key_
                print("Blood type:", blood_)
            elif value_ == "身 高:":  # height
                height_ = key_
                print("Height:", height_)
            elif value_ == "体 重:":  # weight
                weight_ = key_
                print("Weight:", weight_)
            elif value_ == "三 围:":  # measurements
                bvalue_ = key_
                print("Measurements:", bvalue_)
            elif value_ == "出 生:":  # birthplace
                borntown_ = key_
                print("Birthplace:", borntown_)
            elif value_ == "职 业:":  # occupation
                job_ = key_
                print("Occupation:", job_)
            elif value_ == "出 道:":  # debut date
                be_date_ = key_
                print("Debut:", be_date_)
            elif value_ == "兴 趣:":  # interests
                habit_ = key_
                print("Interests:", habit_)
            else:
                print("Unrecognized field:", value_, key_)
                error_info(id_, "Unrecognized field: " + value_)

    detail_soup = soup.find('div', attrs={'class': 'box_entry_title'})  # detailed bio
    try:
        # Double up single quotes so the string-formatted SQL below doesn't break.
        detail_ = str(detail_soup.find_all('p')[0].text).replace('\'', '\'\'')
        print('Details:', detail_)
    except:
        # Some pages have no <p> tag; the text sits directly in the div.
        detail_ = str(detail_soup.find_all('div', attrs={'class': 'infocontent'})[0].text).replace('\'', '\'\'')
        print('Details:', detail_)
    photo_soup = soup.find('div', attrs={'class': 'post_entry'})  # photo-set list; titles only, not the images
    print("Photo sets (partial):")
    try:
        if photo_soup is not None:
            for photo_list in photo_soup.find_all('li'):
                photo_ = photo_list.find_all('a', attrs={'class': 'caption'})[0].text
                print(photo_)
        else:
            print("No photo-set data found")
    except:
        print("Tag parse error, to be investigated")  # occasionally nothing prints; problem not seen again

    db = connect_mysql_db()
    cursor = db.cursor()
    try:
        sql = "insert into personinfomain (code,name,nickname,age,birthday,star,blood,height,weight,body_info,birthtown,job,be_date,habit,detail_info,point) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (id_, name_, nickname_, age_, birthday_, star_, blood_, height_, weight_, bvalue_, borntown_, job_, be_date_, habit_, detail_, point_)
        # print("SQL:", sql)
        cursor.execute(sql)
        db.commit()
        print("Row inserted")
    except:
        print("Insert failed")
        error_info(id_, "Insert failed")

def main():
    n = 10000  # no data below ID 10000
    m = 10100
    while m < 30000:  # no data above ID 30000
        while n <= m:
            url_t_ = 'https://www.nvshens.org/girl/'
            url_n_ = str(n)
            url_all_ = url_t_ + url_n_ + '/'
            check_ = check_html(download_page(url_all_), url_n_)
            if check_ == 'n':
                parse_html(download_page(url_all_))
            else:
                print(url_n_, "no data at this ID!")
            n = n + 1
        print('IDs up to', n - 1, 'done!')
        time.sleep(16)  # rest between batches of 100 IDs
        m = m + 100

main()
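
The same parameter-binding idea from the error_info note applies to the big insert in parse_html; with execute(sql, args) the replace('\'', '\'\'') escaping of the bio text becomes unnecessary. A minimal sketch of just the insert, assuming the same cursor and variables as in parse_html above (my suggestion, not the original code):

    # Hypothetical drop-in for the string-formatted insert in parse_html:
    sql = ("insert into personinfomain "
           "(code,name,nickname,age,birthday,star,blood,height,weight,"
           "body_info,birthtown,job,be_date,habit,detail_info,point) "
           "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    cursor.execute(sql, (id_, name_, nickname_, age_, birthday_, star_, blood_,
                         height_, weight_, bvalue_, borntown_, job_, be_date_,
                         habit_, detail_, point_))
    db.commit()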

2. Table-creation SQL (columns can be added or removed as needed)

DROP TABLE IF EXISTS `personinfomain`;
CREATE TABLE `personinfomain` (
  `id` int(12) UNSIGNED ZEROFILL NOT NULL AUTO_INCREMENT COMMENT 'row number',
  `code` varchar(12) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'person id',
  `name` varchar(60) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'name',
  `nickname` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'alias / nickname',
  `age` varchar(12) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'age',
  `sex` varchar(6) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '女' COMMENT 'sex, defaults to 女 (female)',
  `blood` varchar(6) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'blood type',
  `birthday` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'date of birth',
  `birthtown` varchar(40) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'birthplace',
  `job` varchar(40) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'occupation: current main job; could be split into its own table of job categories',
  `star` varchar(4) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'zodiac sign: 1-12; a rough personality hint, unreliable',
  `height` varchar(6) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'height in cm',
  `weight` varchar(12) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'weight in kg',
  `body_info` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'measurements, e.g. B95 W64 H95; could be split into its own table with field meanings',
  `cup` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'cup size',
  `be_date` varchar(20) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'debut date',
  `habit` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'interests, e.g. anime, singing, games, the arts',
  `detail_info` varchar(1000) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'detailed bio describing the person',
  `point` varchar(6) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'score, covering figure, looks, etc.',
  `news` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'status: recent news',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 110280 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci COMMENT = 'main person-info table, to be split up and optimized later' ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;

DROP TABLE IF EXISTS `error_info`;
CREATE TABLE `error_info` (
  `id` int(6) NOT NULL AUTO_INCREMENT COMMENT 'id, primary key',
  `code` varchar(12) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'record code, usually 6 digits',
  `info` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'error message',
  `do_end` varchar(40) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'resolution',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 100142 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;
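
One detail worth watching in this DDL: the table-level character set is latin1 while the text columns are explicitly utf8, and MySQL's utf8 is the 3-byte subset of UTF-8. If a bio ever contains a character outside the basic multilingual plane (an emoji, say), the insert fails with an "Incorrect string value" error. Converting both tables and the connection to utf8mb4 sidesteps this; a minimal one-off migration sketch (my suggestion, not part of the original post; the function name and connection values are placeholders):

import pymysql

def convert_tables_to_utf8mb4():
    # Hypothetical one-off migration: widen both tables to 4-byte UTF-8.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='', password='',
                           database='your_db_name', charset='utf8mb4')
    try:
        with conn.cursor() as cur:
            cur.execute("ALTER TABLE personinfomain "
                        "CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci")
            cur.execute("ALTER TABLE error_info "
                        "CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci")
    finally:
        conn.close()
    # Remember to also pass charset='utf8mb4' in connect_mysql_db().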

3. Screenshots of the program running

[Screenshot: error_info table structure]

[Screenshot: personinfomain table structure]

[Screenshot: the program running]
