"""Notes by xiaobai121.

Scrape sentences from the juzimi.com website.

juzimi.com has anti-scraping measures, so we bring out the trusty
Selenium webdriver to drive a real browser.

Requires:
- selenium webdriver
- WebDriverWait
- BeautifulSoup
- sys / io / re
"""
from selenium.webdriver.support.ui import WebDriverWait
from material import headers
from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import io
import re
# Path to the ChromeDriver executable.  Raw string so the backslashes are
# taken literally ('\A', '\G' etc. are not valid escapes and trigger
# DeprecationWarning in a normal string literal).
Chrome = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
browser = webdriver.Chrome(Chrome)
# Explicit-wait helper bound to the shared browser, 2-second timeout.
wait = WebDriverWait(browser, 2)
# Re-wrap stdout so Chinese text prints correctly on a GBK Windows console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# Module-level accumulator for every scraped "sentence<TAB>author" entry.
sentence = []
class make_monsy():
    """Crawl quote pages from juzimi.com and append the sentences to a file."""

    def crawl_source(self, url):
        """Fetch *url* with the shared Selenium browser.

        Returns the page HTML, or None when nothing was retrieved or an
        EOFError was raised.
        """
        print('going to crawl URL : %s' % url)
        try:
            browser.get(url)
            html = browser.page_source
            if html:
                return html
            return None
        except EOFError:
            return None

    def find_html(self, html):
        """Extract (sentence, author) pairs from *html*.

        Appends each pair to the module-level ``sentence`` list and returns
        that list.
        """
        find_h = re.findall('" title="查看本句" class="xlistju">(.*?)</a>', html)
        find_n = re.findall('class="views-field-field-oriwriter-value">(.*?)</a>', html)
        # BUGFIX: the original nested the two loops, pairing every sentence
        # with every author (a cross product).  Pair them positionally.
        for quote, author in zip(find_h, find_n):
            sentence.append('\'%s\t%s\',' % (quote, author))
        return sentence

    def page_num(self, html):
        """Return the absolute URL of the 'next page' link in *html*.

        Raises IndexError when no next-page link exists (i.e. last page) —
        ``core`` relies on that to terminate.
        """
        soup = BeautifulSoup(html, 'lxml')
        page = soup.select('#block-views-xqalbumpage-block_1 > div > div > div > div > div.item-list > ul > li.pager-next.last > a')
        next_page = re.findall('href="(.*?)"', '%s' % page[0])
        url = 'https://www.juzimi.com%s' % next_page[0]
        return url

    def write_sentence(self, sentence, path):
        """Append every entry of *sentence* to the file at *path*."""
        try:
            # Open the file once instead of re-opening it for every line.
            with open(path, 'a') as save:
                for line in sentence:
                    save.write('%s\n\n' % line)
        except UnicodeEncodeError:
            # Best-effort: skip content the target encoding cannot represent.
            pass
        print('sentence saving to %s' % path)

    def core(self, url):
        """Crawl from *url*, following 'next page' links until exhausted."""
        path = 'C:\\Users\\121812\\Desktop\\makemoney.txt'
        while True:
            try:
                # Fetch each page once; the original fetched every page twice.
                html = self.crawl_source(url)
                self.write_sentence(self.find_html(html), path)
                print(url)
                url = self.page_num(html)
            except IndexError:
                # No next-page link found: crawling is finished.
                print('already not web ...')
                break
# Script entry point: kick off the crawl at a fixed album URL.
if __name__ == '__main__':
    crawler = make_monsy()
    crawler.core('https://www.juzimi.com/album/3209903')