# Python crawling is fun and much more concise than Java; the lxml/XPath
# library has excellent support for fetching pages and extracting data.
# Installing lxml is straightforward, so we go straight to the key code:
import time
import traceback
import requests
import pymysql.cursors
from lxml import etree
from Include.pyBean.WallPaperBean import WallPaperBean
# This module crawls wallpapers from http://ioswall.com/
IPHONEWALLS_URL = 'http://ioswall.com/category/{category_id}/page/{page}'

# Display name -> category id used in the URL path. The original wrapped
# each value in parentheses, which read like 1-tuples but were plain
# strings; the parentheses are dropped, the values are unchanged.
CATEGORY_INFO = {
    'Original': 'original',
    'Love': 'love',
    'Flowers': 'flowers',
    'technology': 'Technology',
}

# Accumulates WallPaperBean objects across all categories; flushed to
# MySQL in one batch by savaDataToDateBase().
wallPaperBeanList = []
def get_data(href):
    """Fetch *href* and return the parsed lxml HTML tree.

    Tries once with a 10 s timeout, then retries once with 30 s. Returns
    the sentinel string '[]' when the page cannot be fetched or answers
    with a non-200 status; callers treat that as "no more data".
    """
    # One fast attempt, one patient retry — replaces the original's
    # copy-pasted try/except blocks and its bare `except:` clauses.
    for attempt, timeout in enumerate((10, 30)):
        try:
            response = requests.get(href, timeout=timeout)
            if response.status_code == 200:
                return etree.HTML(response.text)
            # Non-200 means the page/category ran out; no retry needed.
            return '[]'
        except requests.RequestException:
            if attempt == 0:
                print(traceback.format_exc())
                print('retry>>>')
    # Both attempts raised: report the URL and hand back the sentinel.
    print('failure>href>')
    print(href)
    return '[]'
def savaDataToDateBase():
    """Insert every WallPaperBean in wallPaperBeanList into MySQL.

    The original reconnected to the database once per row inside the
    loop; this version opens a single connection, inserts all rows, and
    commits once. The connection is always closed, even on error.

    Expected table schema (for reference; the table must already exist):
      wallpaper(id PK auto_increment, category, view_img, img,
                created_time, img_tag)
    """
    connection = pymysql.connect(host='127.0.0.1', port=3306, user='admin',
                                 password='admin', db='AllThingArePower',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            # Always use %s placeholders — never build SQL by string
            # concatenation (SQL injection + quoting bugs).
            insert_sql = ('insert into wallpaper '
                          '(category,view_img,img,created_time,img_tag) '
                          'values (%s,%s,%s,%s,%s)')
            for bean in wallPaperBeanList:
                cursor.execute(insert_sql, (str(bean.category),
                                            str(bean.view_img),
                                            str(bean.img),
                                            str(bean.created_time),
                                            str(bean.img_tag)))
        # Single commit for the whole batch.
        connection.commit()
    finally:
        connection.close()
def auto_get_data(max_pages=1100):
    """Crawl every category, collect wallpaper beans, then save to MySQL.

    max_pages: exclusive upper bound on pages fetched per category
        (defaults to the original hard-coded 1100; use a small value
        for test runs).

    Crawling a category stops early as soon as get_data() returns its
    '[]' failure sentinel.
    """
    for category_name, category_id in CATEGORY_INFO.items():
        for page in range(1, max_pages):
            url = IPHONEWALLS_URL.format(category_id=category_id, page=page)
            tree = get_data(url)
            # '[]' is get_data's "page unavailable" sentinel — move on
            # to the next category.
            if tree == '[]':
                break
            # contains(@class, ...) matches thumbnails carrying this
            # exact WordPress class list.
            img_urls = tree.xpath('//li//div//a/img[contains(@class, "attachment-post-thumbnail size-post-thumbnail wp-post-image")]/@src')
            created_times = tree.xpath('//li//div//li/a/text()')
            # zip() guards against the two node lists differing in
            # length (the original indexed created_times and could
            # raise IndexError).
            for img_url, created_time in zip(img_urls, created_times):
                wallPaperBeanList.append(
                    WallPaperBean(category_name, img_url, img_url,
                                  created_time, '', ''))
            print('现在的list中图片数量==' + str(len(wallPaperBeanList)))
            time.sleep(1)  # throttle: be polite to the server
    # Persist everything collected above in one batch.
    savaDataToDateBase()
# 入库后查询库里的数据操作
def queryDataFromDB():
    """Print the wallpaper rows with id < 10 (sanity check after insert).

    Bug fixes vs. the original: cursor.execute() returns the affected
    ROW COUNT, not the rows — the rows must be fetched with fetchall();
    and the parameter argument must be a tuple, so (10) (a bare int
    in parentheses) becomes (10,).
    """
    connection = pymysql.connect(host='127.0.0.1', port=3306, user='admin',
                                 password='admin', db='AllThingArePower',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute(
                'select category,view_img,created_time from wallpaper where id<%s',
                (10,))
            rows = cursor.fetchall()
        print("-----------华丽分割线------------")
        # DictCursor yields one dict per row.
        for row in rows:
            print(row)
    finally:
        connection.close()
# Script entry point: run the full crawl. queryDataFromDB() is left
# disabled; enable it to inspect the inserted rows afterwards.
if __name__ == '__main__':
    auto_get_data()
    # queryDataFromDB()