爬虫---实现爬取京东商品（手办）

最新推荐文章于 2021-06-09 21:26:44 发布

CourserLi

最新推荐文章于 2021-06-09 21:26:44 发布

阅读量509

点赞数

分类专栏： WebSpider---爬虫文章标签： selenium python

本文链接：https://blog.csdn.net/CourserLi/article/details/105229467

版权

WebSpider---爬虫专栏收录该内容

5 篇文章 0 订阅

订阅专栏

效果图：

在这里插入图片描述

源代码：

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions
import urllib
import json
import csv
import time

class JdSpider():
    """Selenium spider that saves JD.com search results to a txt/json/csv file.

    Typical use is ``JdSpider().crawl()``, which prompts for a keyword and an
    output format, walks every result page and writes one record
    (link, price, name, comment count) per product.

    Attributes set while running:
        fm: chosen output format, one of ``'txt'``, ``'json'``, ``'csv'``.
        fd: open output file object.
        browser: the Chrome WebDriver.
        wait: ``WebDriverWait`` bound to ``browser`` (10 s timeout).
        data: list of ``(link, price, name, comment)`` tuples for the page
            parsed most recently.
        isLast: ``True`` once no further result page can be reached.
    """

    # Upper bound on attempts for one page operation.  The retries used to be
    # unbounded recursion, which ended in RecursionError when a page never
    # became ready.
    MAX_RETRIES = 3

    def open_file(self):
        """Prompt for the output format and open the matching ``Jd.<fmt>`` file.

        Keeps asking until one of ``txt``, ``json`` or ``csv`` is entered, then
        stores the format on ``self.fm`` and the open file on ``self.fd``.
        """
        self.fm = input('请输入文件保存格式（txt、json、csv）：')
        while self.fm not in ('txt', 'json', 'csv'):
            self.fm = input('输入错误，请重新输入文件保存格式（txt、json、csv）：')
        if self.fm == 'csv':
            # The csv module does its own newline handling, so the file must
            # be opened with newline=''.
            self.fd = open('Jd.csv', 'w', encoding='utf-8', newline='')
        else:
            self.fd = open('Jd.' + self.fm, 'w', encoding='utf-8')

    def open_browser(self):
        """Start Chrome and create the explicit wait used for all lookups."""
        self.browser = webdriver.Chrome()
        # No implicit wait is configured: every lookup in this class goes
        # through self.wait, and Selenium warns that mixing implicit and
        # explicit waits leads to unpredictable wait times.
        self.wait = WebDriverWait(self.browser, 10)  # until() raises TimeoutException after 10 s

    def init_variable(self):
        """Reset the per-crawl state (``data`` and ``isLast``)."""
        self.data = []
        self.isLast = False

    def _scroll_to_bottom(self):
        """Scroll to the page bottom, pausing before and after the scroll."""
        time.sleep(1)
        self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(2)

    def _texts(self, xpath):
        """Wait for all elements matching *xpath* and return their texts."""
        elements = self.wait.until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
        return [element.text for element in elements]

    def parse_page(self):
        """Collect the products of the current result page into ``self.data``.

        Each record is a ``(link, price, name, comment)`` tuple.  A timeout or
        a stale element triggers a retry, at most ``MAX_RETRIES`` attempts in
        total; if every attempt fails, ``self.data`` is left empty so that the
        previous page's records are not reused.
        """
        for _ in range(self.MAX_RETRIES):
            try:
                items = self.wait.until(EC.presence_of_all_elements_located((By.XPATH, '//li[@class="gl-item"]')))
                skus = [item.get_attribute('data-sku') for item in items]
                links = ['https://item.jd.com/{sku}.html'.format(sku=sku) for sku in skus]
                prices = self._texts('//div[@class="gl-i-wrap"]/div[2]/strong/i')
                names = self._texts('//div[@class="gl-i-wrap"]/div[3]/a/em')
                comments = self._texts('//div[@class="gl-i-wrap"]/div[4]/strong')
                # Materialise the records: a bare zip() can be iterated only
                # once, so a later failed parse would silently leave an
                # exhausted iterator behind.
                self.data = list(zip(links, prices, names, comments))
                return
            except selenium.common.exceptions.TimeoutException:
                print('parse_page: TimeoutException')
            except selenium.common.exceptions.StaleElementReferenceException:
                # The page re-rendered after the elements were located, so the
                # references no longer point into the live DOM.  Reload,
                # scroll again, and locate everything afresh on the next pass.
                print('parse_page: StaleElementReferenceException')
                self.browser.refresh()
                self._scroll_to_bottom()
        self.data = []

    def turn_page(self):
        """Click the "next page" link, or set ``isLast`` when there is none.

        ``WebDriverWait.until`` swallows ``NoSuchElementException`` while
        polling and raises ``TimeoutException`` once the timeout expires, so a
        timeout is the signal that no clickable "next page" link exists.
        """
        for _ in range(self.MAX_RETRIES):
            try:
                self.wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@class="pn-next"]'))).click()
            except selenium.common.exceptions.TimeoutException:
                print('turn_page: TimeoutException')
                self.isLast = True
                return
            except selenium.common.exceptions.NoSuchElementException:
                self.isLast = True
                return
            except selenium.common.exceptions.StaleElementReferenceException:
                # The link was replaced between lookup and click; reload and
                # try the click again instead of re-parsing the same page.
                print('turn_page: StaleElementReferenceException')
                self.browser.refresh()
                self._scroll_to_bottom()
            else:
                self._scroll_to_bottom()
                return
        # The link stayed stale on every attempt: stop rather than loop forever.
        self.isLast = True

    def write_to_file(self):
        """Append the records in ``self.data`` to the output file.

        * ``txt``: a separator line followed by one ``key：value`` line per field.
        * ``json``: one JSON object per line, so the file can be parsed line
          by line (objects used to be written back to back, which no JSON
          parser accepts).
        * ``csv``: one row per record.
        """
        if self.fm == 'txt':
            for item in self.data:
                self.fd.write('----------------------------------------\n')
                self.fd.write('link：' + str(item[0]) + '\n')
                self.fd.write('price：' + str(item[1]) + '\n')
                self.fd.write('name：' + str(item[2]) + '\n')
                self.fd.write('comment：' + str(item[3]) + '\n')
        elif self.fm == 'json':
            keys = ('link', 'price', 'name', 'comment')
            for item in self.data:
                json.dump(dict(zip(keys, item)), self.fd, ensure_ascii=False)
                self.fd.write('\n')
        elif self.fm == 'csv':
            csv.writer(self.fd).writerows(self.data)

    def close_file(self):
        """Close the output file."""
        self.fd.close()

    def close_browser(self):
        """Quit the browser session."""
        self.browser.quit()

    def crawl(self):
        """Run the whole crawl: prompt, search, walk all pages, clean up.

        The output file and the browser are released even when a step raises.
        """
        # Imported here so that urllib.parse is guaranteed to be loaded;
        # a bare "import urllib" does not promise the submodule is available.
        from urllib.parse import quote

        kw = input('请京东商品的名字：')
        self.open_file()
        try:
            self.open_browser()
            try:
                self.init_variable()
                print('开始爬取')
                self.browser.get('https://search.jd.com/Search?keyword=' + quote(kw) + '&enc=utf-8')
                self._scroll_to_bottom()
                count = 0
                while not self.isLast:
                    count += 1
                    print('正在爬取第 ' + str(count) + ' 页......')
                    self.parse_page()
                    self.write_to_file()
                    self.turn_page()
            finally:
                self.close_browser()
        finally:
            self.close_file()
        print('结束爬取')

if __name__ == '__main__':
    # Script entry point: build a spider and run the complete crawl.
    JdSpider().crawl()

讲解：

一、本代码相比以往很棒的一点就是函数不用带参数

从源代码中可以看到参数全用 self 代替，要做到很简单，只需要用一个 class 将所有的函数给包起来，只要在主函数中调用类的函数即可

二、selenium 库

可以看到源代码中导入了大量方法，但并不用了解这些，当模板用就行了

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions

下面列举出一些模拟浏览器的模板：

1、打开浏览器模板

def open_browser(self):
    """Template: start Chrome and set up the implicit and explicit waits.

    Stores the driver on ``self.browser`` and a 10-second ``WebDriverWait``
    on ``self.wait``.
    """
    driver = webdriver.Chrome()  # launch a Chrome session
    self.browser = driver
    driver.implicitly_wait(10)  # implicit wait of 10 seconds
    self.wait = WebDriverWait(driver, 10)  # explicit wait: raises once the 10 s limit is exceeded

2、解析页面模板

def parse_page(self):
    """Template: collect element data from the current page into ``self.data``.

    Waits (through ``self.wait``) for the elements matched by each XPath
    placeholder, reads one value per element and stores the zipped columns
    on ``self.data``.  The ``'--xpath--'`` and ``'--URL{a1}--'`` strings are
    placeholders to be replaced before use.
    """
    try:
    	# Two-step extraction: read an attribute, then build a URL from it
        a = self.wait.until(EC.presence_of_all_elements_located((By.XPATH,'--xpath--')))
        a = [item.get_attribute('data-sku') for item in a]  # 'data-sku' attribute of each element
        a2 = ['--URL{a1}--'.format(a1=item) for item in a]
        # NOTE(review): a2 is built but zip() below uses a, not a2 -- confirm which was intended
        
        # Plain extraction: visible text of each element
        b = self.wait.until(EC.presence_of_all_elements_located((By.XPATH,'--xpath--')))
        b = [item.text for item in b]
        
        self.data = zip(a,b)  # zip() is a one-shot iterator: it can be consumed only once
        
    # Exception handling
    except selenium.common.exceptions.TimeoutException:  # the wait timed out
        print('parse_page: TimeoutException')
        self.parse_page()  # retry by calling itself; NOTE(review): there is no retry limit
    except selenium.common.exceptions.StaleElementReferenceException:  # an element reference went stale
        print('parse_page: StaleElementReferenceException')
        self.browser.refresh()  # reload the page

语句 try...except... 是异常处理方法：只有当 try 中的代码抛出了 except 后面指定的异常时，才会执行该 except 分支下面的语句

3、其他内容

# 点击某个内容
self.wait.until(EC.element_to_be_clickable((By.XPATH,'--xpath--'))).click()
# 向下拖动网页
self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")

# 定位不到元素,常用于检测异常,常用于 try except 语句中
selenium.common.exceptions.NoSuchElementException:

4、关闭浏览器

def close_browser(self):
    """Template: quit the browser session held in ``self.browser``."""
    self.browser.quit()

参考代码：
selenium的常见异常
 webdriver中的等待——主要讲解WebDriverWait()
爬虫系列(十二) selenium的基本使用
 爬虫系列(十三) 用selenium爬取京东商品

CourserLi

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录