如何使用 Python 爬虫自动地从网站上爬取想要的数据

以下代码实现了从电商网站上自动获取书籍信息:

from selenium import webdriver
import time
from bs4 import BeautifulSoup
# Scrape book listings from the shop's search results into data1.txt.
#
# Flow: open the landing page, type the search keyword, then for every result
# page scroll down in steps, parse the rendered HTML with BeautifulSoup, write
# one tab-separated line per item (name, shop/press, image URL, price, comment
# count) and click "next" until the pager's next button is disabled or absent.

# Landing page; its search box is used to submit the query.
base_url = "https://www.jd.com/?cu=true&utm_source=baidu-pinzhuan&utm_medium=cpc&utm_campaign=t_288551095_baidupinzhuan&utm_term=0f3d30c8dba7459bb52f2eb5eba8ac7d_0_e98375277f714ddabd4e98712adec3a9"
# Pause after each scroll step so lazily loaded items get a chance to render.
# NOTE(review): 0.5 s is an estimate — tune it against the live site.
scroll_pause = 0.5
with open('data1.txt', 'w', encoding='utf-8') as f:
    n = 1  # 1-based running index of the record being processed
    driver = webdriver.Chrome(executable_path='d:\\chromedriver.exe')
    try:
        # implicitly_wait() only configures the element-lookup timeout; it is
        # not a delay, so it is set once here instead of inside the scroll loop.
        driver.implicitly_wait(10)
        driver.get(base_url)

        driver.find_element_by_id('key').send_keys('大数据')
        time.sleep(3)
        driver.find_element_by_class_name('button').click()
        while True:
            # Scroll down in 1000px steps (15 steps per page).
            scroll = 1000
            for _ in range(15):
                driver.execute_script('var q = document.documentElement.scrollTop={}'.format(scroll))
                scroll += 1000
                time.sleep(scroll_pause)

            doc = BeautifulSoup(driver.page_source, 'html.parser')
            book_l = doc.select('.gl-warp > li')
            for book in book_l:
                print(n)
                img = book.select('.p-img > a > img')[0]
                # Fall back to the lazy-load attribute when 'src' is missing;
                # use '' if both are absent so the join below cannot fail.
                book_imge = img.get('src') or img.get('data-lazy-img') or ''
                book_price = book.select('.p-price > strong')[0].getText()
                book_name = book.select('.p-name > a >em')[0].getText()
                book_com_num = book.select(".p-commit > strong")[0].getText()
                try:
                    book_press = book.select('.p-shopnum >a')[0].getText()
                except IndexError:
                    # No shop/press link on this item.
                    book_press = '不详'
                s = '\t'.join(
                    [book_name, book_press, book_imge, book_price, book_com_num]
                ) + '\n'
                n += 1
                f.write(s)

            # Stop when the "next" button is disabled or missing.
            # NOTE(review): assumes the last page marks the button with both
            # 'pn-next' and 'disabled' classes — confirm against the live page.
            if doc.select('.pn-next.disabled') or not doc.select('.pn-next'):
                break
            driver.find_element_by_class_name('pn-next').click()
            time.sleep(2)
    finally:
        # Always shut the browser down, even if scraping fails midway.
        driver.quit()

 

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值