# Scrape JD.com mobile phone listings (phone name, phone price, phone image) using requests.
import requests
from lxml import etree
import time
import os
import pymysql
def jdphone_spider(url,beginPage,endPage):
    """Crawl a range of JD search-result pages and store the phones found.

    Opens one MySQL connection for the whole crawl, visits each page in
    ``range(beginPage, endPage)`` and delegates the per-page work to
    ``load_page``.

    Args:
        url: Base search URL; ``&page=<n>`` is appended for every page.
        beginPage: First page number to crawl (inclusive).
        endPage: Page number at which to stop (exclusive, as with ``range``).
    """
    # NOTE(review): credentials are hard-coded here; consider moving them to
    # configuration.
    database = pymysql.connect(host="127.0.0.1", user="root", passwd="960505", db="jdPhone", charset='utf8')
    try:
        cursor = database.cursor()
        try:
            for page in range(beginPage, endPage):
                # The ``page`` query value sent to JD is 2*page - 1.
                # Presumably JD numbers result pages with odd values -- TODO confirm.
                pn = page * 2 - 1
                print('正在抓取第{}页'.format(page))
                full_url = url + '&page=' + str(pn)
                # Pause before every page request (presumably to throttle the crawl).
                time.sleep(2)
                load_page(database, cursor, full_url)
        finally:
            # Release the cursor even when a page fails part-way through.
            cursor.close()
    finally:
        # Always close the connection so a failed crawl does not leak it.
        database.close()
def load_page(database,cursor,url):
    """Scrape one JD search-result page and store every phone found on it.

    For each ``li`` under ``#J_goodsList`` the price is read from the list
    page, the product name and main image URL are read from the product's
    detail page, the image bytes are downloaded, and the record is handed to
    ``save_phone_info``.

    List items whose price, detail link or image URL cannot be located are
    reported and skipped, so one malformed entry does not abort the page.

    Args:
        database: Open database connection, passed through to ``save_phone_info``.
        cursor: Cursor on ``database``, passed through to ``save_phone_info``.
        url: Full URL of the search-result page to fetch.

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
    """
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36 Edg/86.0.622.691'
    }
    # Seconds. requests applies no timeout unless one is passed explicitly, so
    # a stalled server would otherwise block the crawl.
    request_timeout = 10
    skip_message = '跳过无法解析的商品条目'

    response = requests.get(url=url, headers=headers, timeout=request_timeout)
    list_tree = etree.HTML(response.text)
    li_list = list_tree.xpath('//*[@id="J_goodsList"]/ul/li')

    # The directory is created as before, although nothing in this function
    # writes to it (the image bytes go to the database).
    os.makedirs('./jdPicLibs', exist_ok=True)

    for li in li_list:
        # Positional XPaths: they depend on the page layout and may match
        # nothing for some entries, hence the length checks below.
        price_parts = li.xpath('./div/div[3]/strong//text()')
        detail_hrefs = li.xpath('./div/div[1]/a/@href')
        if len(price_parts) < 3 or not detail_hrefs:
            print(skip_message)
            continue
        phone_price = price_parts[1] + price_parts[2]

        # The href is prefixed with the scheme (assumes a protocol-relative
        # link such as ``//item.jd.com/...`` -- TODO confirm).
        detail_url = 'https:' + detail_hrefs[0]
        detail_text = requests.get(url=detail_url, headers=headers, timeout=request_timeout).text
        # Separate name so the list-page tree is not shadowed.
        detail_tree = etree.HTML(detail_text)

        name_parts = detail_tree.xpath('/html/body/div[6]/div/div[2]/div[1]/text()')
        phone_name = ''.join(name_parts).strip()
        print(phone_name)

        img_sources = detail_tree.xpath('//div[@id="spec-n1"]/img/@data-origin')
        if not img_sources:
            print(skip_message)
            continue
        img_url = 'https:' + img_sources[0]
        img_data = requests.get(url=img_url, headers=headers, timeout=request_timeout).content

        save_phone_info(database, cursor, phone_name, phone_price, img_data)
def save_phone_info(database,cursor,phone_name,phone_price,phone_img):
    """Insert one phone record into the ``phone`` table.

    The statement is executed with driver-side parameter binding and then
    committed. Any exception raised by the execute or the commit is printed
    and the transaction is rolled back; nothing is re-raised.

    Args:
        database: Connection object providing ``commit()`` and ``rollback()``.
        cursor: Cursor object providing ``execute(sql, args)``.
        phone_name: Value bound to the first placeholder.
        phone_price: Value bound to the second placeholder.
        phone_img: Value bound to the third placeholder (the caller in this
            file passes the downloaded image bytes).
    """
    # The leading column is supplied as NULL -- presumably an auto-generated
    # key; confirm against the table schema.
    insert_stmt = "INSERT INTO phone VALUES (null,%s, %s, %s);"
    row = (phone_name, phone_price, phone_img)
    try:
        cursor.execute(insert_stmt, row)
        database.commit()
    except Exception as err:
        print(err)
        database.rollback()
if __name__ == '__main__':
    # Script entry point: read the page range interactively, run the crawl and
    # report the elapsed wall-clock time in seconds.
    # Note: jdphone_spider iterates range(beginPage, endPage), so the end page
    # entered here is not itself crawled.
    beginPage = int(input('请输入起始页:'))
    endPage = int(input('请输入结束页:'))
    url = 'https://search.jd.com/Search?keyword=手机&enc=utf-8'
    print('开始爬取......')
    start_time = time.time()
    jdphone_spider(url,beginPage,endPage)
    end_time = time.time()
    # The elapsed time is a float; concatenating it directly to a str raises
    # TypeError, so it is formatted into the message instead.
    print('爬取完毕!!!\n{}'.format(end_time-start_time))