Python3 实现网页爬虫并入库 MySQL

4 篇文章 1 订阅
2 篇文章 0 订阅

Python 爬虫非常好玩,代码比 Java 简洁很多。lxml 库提供的 XPath 支持让网页数据的提取非常方便(网页抓取本身由 requests 完成);lxml 的安装很简单(pip install lxml),这里不再细讲,直接看关键代码:

import time
import traceback
import requests
import pymysql.cursors
from lxml import etree
from Include.pyBean.WallPaperBean import WallPaperBean
# This script crawls wallpapers from http://ioswall.com/

# URL template for one listing page of a category.
IPHONEWALLS_URL = 'http://ioswall.com/category/{category_id}/page/{page}'

# Maps the category name stored in the database to the slug used in the URL.
# The values are plain strings (the former parentheses did not make tuples).
# The 'Technology' entry previously had name and slug swapped, unlike the
# other entries; it now follows the same name -> lowercase-slug pattern.
CATEGORY_INFO = {
    'Original': 'original',
    'Love': 'love',
    'Flowers': 'flowers',
    'Technology': 'technology',
}

# Module-level accumulator of crawled wallpaper beans, filled by the crawler
# and later written to the database.
wallPaperBeanList = []

def get_data(href):
    """Download ``href`` and return it parsed as an lxml HTML tree.

    The request is attempted with a 10 second timeout and, if that attempt
    raises, retried once with a 30 second timeout.

    Args:
        href: URL of the page to fetch.

    Returns:
        The result of ``etree.HTML(response.text)`` when a response with
        status code 200 is received. The string ``'[]'`` is returned as a
        failure sentinel when the status code is not 200 (no retry in that
        case) or when both attempts raised.
    """
    timeouts = (10, 30)
    last_attempt = len(timeouts) - 1
    for attempt, timeout in enumerate(timeouts):
        try:
            response = requests.get(href, timeout=timeout)
            if response.status_code != 200:
                # A non-200 answer is treated as final, not as a reason to retry.
                return '[]'
            return etree.HTML(response.text)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the crawler.
            if attempt < last_attempt:
                print(traceback.format_exc())
                print('retry>>>')
            else:
                print('failure>href>')
                print(href)
    return '[]'


def savaDataToDateBase():
    """Insert every bean in ``wallPaperBeanList`` into the ``wallpaper`` table.

    The table is NOT created here; it must already exist. The intended schema
    is: ``id`` INT(11) auto-increment primary key, and the VARCHAR(255)
    utf8_bin columns ``category``, ``view_img``, ``img`` (all NOT NULL),
    ``created_time`` and ``img_tag``.

    All rows are sent over a single connection and committed once, instead of
    opening, committing and closing a new connection per row. As a result the
    insert is all-or-nothing: if any row fails, nothing is committed. The
    connection is closed even when the insert raises.

    Values are passed as query parameters (never concatenated into the SQL
    string), each converted with ``str()``.
    """
    if not wallPaperBeanList:
        # Nothing to store; do not open a connection at all.
        return

    insert_sql = ('insert into wallpaper (category,view_img,img,created_time,img_tag) '
                  'values (%s,%s,%s,%s,%s)')
    rows = [
        (str(bean.category), str(bean.view_img), str(bean.img),
         str(bean.created_time), str(bean.img_tag))
        for bean in wallPaperBeanList
    ]

    connection = pymysql.connect(host='127.0.0.1', port=3306, user='admin', password='admin', db='AllThingArePower',
                                 charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.executemany(insert_sql, rows)
        connection.commit()
    finally:
        connection.close()
def auto_get_data(max_pages=1099):
    """Crawl every category in ``CATEGORY_INFO`` and store the results.

    For each category, listing pages 1..``max_pages`` are fetched with
    ``get_data``. Crawling of a category stops at the first page that could
    not be fetched. Every thumbnail found is appended to the module-level
    ``wallPaperBeanList`` as a ``WallPaperBean``; after all categories are
    done, ``savaDataToDateBase`` writes the list to MySQL.

    Args:
        max_pages: Highest page number to request per category. The default
            keeps the original range; pass a small value (e.g. 1) for a quick
            test run.
    """
    for category_name, category_slug in CATEGORY_INFO.items():
        for page in range(1, max_pages + 1):
            url = IPHONEWALLS_URL.format(category_id=category_slug, page=page)
            response_data = get_data(url)

            # get_data returns the string '[]' on failure. Also guard against
            # None in case the parser produced no tree, which would otherwise
            # crash on the .xpath call below.
            if response_data is None or response_data == '[]':
                break

            # contains(@attr, value) matches when the attribute contains the
            # given value, so extra classes on the <img> do not matter.
            imgUrls = response_data.xpath('//li//div//a/img[contains(@class, "attachment-post-thumbnail size-post-thumbnail wp-post-image")]/@src')
            createTimes = response_data.xpath('//li//div//li/a/text()')

            for index, imgUrl in enumerate(imgUrls):
                # The two XPath queries are independent, so there may be fewer
                # timestamps than images; fall back to '' instead of raising
                # IndexError and losing everything crawled so far.
                created_time = createTimes[index] if index < len(createTimes) else ''
                wallPaperBean = WallPaperBean(category_name, imgUrl, imgUrl, created_time, '', '')
                wallPaperBeanList.append(wallPaperBean)
                print('现在的list中图片数量==' + str(len(wallPaperBeanList)))
            # Pause between pages so the site is not hit too fast.
            time.sleep(1)
    # Persist everything that was collected into MySQL.
    savaDataToDateBase()

def queryDataFromDB():
    """Query the ``wallpaper`` table after import and print what was found.

    Selects ``category``, ``view_img`` and ``created_time`` for rows with
    ``id < 10``, prints a separator line, the number of matched rows, and then
    each row.

    Returns:
        The rows returned by ``cursor.fetchall()``.
    """
    connection = pymysql.connect(host='127.0.0.1', port=3306, user='admin', password='admin', db='AllThingArePower',
                                 charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            # execute() returns only the number of affected/matched rows; the
            # rows themselves have to be fetched from the cursor.
            row_count = cursor.execute(
                'select category,view_img,created_time from wallpaper where id<%s', (10,))
            rows = cursor.fetchall()
    finally:
        # No commit is needed for a read-only query; always release the connection.
        connection.close()

    print("-----------华丽分割线------------")
    print(row_count)
    for row in rows:
        print(row)
    return rows


if __name__ == '__main__':
    # Script entry point: crawl all categories and write them to MySQL.
    auto_get_data()
    # queryDataFromDB()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值