Learning Python from Scratch: Web Scraping (July 14)

Python

--- study notes by 小白121

 

Scraping sentences from the quote site 句子迷 (juzimi.com)

Since 句子迷 has an anti-scraping mechanism in place, we bring out the trusty webdriver.

The following modules are needed (a minimal sketch of the webdriver idea follows the list):

  • webdriver
  • WebDriverWait
  • BeautifulSoup
  • sys
  • io
  • re
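
Before the full script, here is a minimal, self-contained sketch of the idea: a real browser driven by webdriver visits the page like a normal user and hands back the rendered HTML, which is what gets us past the anti-scraping checks. The chromedriver path is machine-specific and must be adjusted for your setup:

# Minimal sketch: drive a real Chrome instance and read the rendered page.
# The chromedriver path below is machine-specific -- adjust it for your setup.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
try:
    driver.get('https://www.juzimi.com/album/3209903')
    # Wait up to 10 seconds for the page body to exist before reading it.
    WebDriverWait(driver, 10).until(lambda d: d.find_element_by_tag_name('body'))
    print(len(driver.page_source))  # full rendered HTML
finally:
    driver.quit()  # always release the browser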

 

 

from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import io
import re

# Path to chromedriver; a raw string keeps the backslashes from being read as escapes.
chrome_path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
browser = webdriver.Chrome(chrome_path)
wait = WebDriverWait(browser, 2)
# Re-wrap stdout so Chinese text prints correctly on a Windows console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

sentence = []  # running list of "sentence<TAB>author" strings

class make_monsy:

    def crawl_source(self, url):
        """Load a page in the browser and return its rendered HTML."""
        print('going to crawl URL : %s' % url)
        try:
            browser.get(url)
            html = browser.page_source
            if html:
                return html
        except WebDriverException:  # the page failed to load
            return None


    def find_html(self, html):
        """Extract sentence/author pairs from the rendered HTML."""
        find_h = re.findall('" title="查看本句" class="xlistju">(.*?)</a>', html)
        find_n = re.findall('class="views-field-field-oriwriter-value">(.*?)</a>', html)

        # Pair each sentence with its author by position; the original nested
        # loop attached every sentence to the last author on the page.
        page_sentences = ['\'%s\t%s\',' % (i, i_n) for i, i_n in zip(find_h, find_n)]
        sentence.extend(page_sentences)  # keep the global running list
        return page_sentences            # write only this page's sentences

    def page_num(self, html):
        """Return the URL of the next page; raises IndexError on the last page."""
        soup = BeautifulSoup(html, 'lxml')
        page = soup.select('#block-views-xqalbumpage-block_1 > div > div > div > div > div.item-list > ul > li.pager-next.last > a')
        next_page = re.findall('href="(.*?)"', '%s' % page[0])  # no link -> IndexError
        url = 'https://www.juzimi.com%s' % next_page[0]
        return url


    def write_sentence(self, sentence, path):
        """Append the collected sentences to a text file."""
        try:
            # Open the file once instead of re-opening it for every line.
            with open(path, 'a', encoding='utf-8') as save:
                for i in sentence:
                    save.write('%s\n\n' % i)
        except UnicodeEncodeError:
            pass
        print('sentences saved to %s' % path)

    def core(self, url):
        path = 'C:\\Users\\121812\\Desktop\\makemoney.txt'
        while True:
            # Fetch each page once and reuse the HTML for both extraction
            # and pagination (the original fetched every URL twice).
            html = self.crawl_source(url)
            if html is None:  # page failed to load
                break
            self.write_sentence(self.find_html(html), path)
            print(url)
            try:
                url = self.page_num(html)
            except IndexError:
                # No "next page" link found, so we have reached the last page.
                print('no more pages ...')
                break



if __name__ == '__main__':
    tmp = make_monsy()
    tmp.core('https://www.juzimi.com/album/3209903')
    browser.quit()  # close the browser once the crawl is finished
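
One loose end: the script builds a WebDriverWait but never calls it, so page_source is read as soon as get() returns. Here is a sketch of how that wait could be put to work, assuming the xlistju class from the extraction regex above marks each sentence element (expected_conditions is Selenium's standard helper module):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def crawl_source_with_wait(url):
    """Like crawl_source, but block until the sentence list has rendered."""
    browser.get(url)
    # Pause (up to the 2 seconds configured for `wait`) until at least one
    # sentence element is present in the DOM.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'xlistju')))
    return browser.page_source

If the element never appears within the timeout, until() raises Selenium's TimeoutException, which could be caught the same way crawl_source handles WebDriverException.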

 
