"""Notes by xiaobai121.

Scrape sentences from the juzimi.com website.

juzimi.com has anti-scraping measures, so we bring out the trusty
Selenium webdriver to drive a real browser.

Requires:
- selenium webdriver
- WebDriverWait
- BeautifulSoup
- sys / io / re
"""
from selenium.webdriver.support.ui import WebDriverWait
from material import headers
from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import io
import re
# Path to the ChromeDriver executable.  Raw string so the backslashes are
# taken literally ('\A', '\G' etc. are not valid escapes and trigger
# DeprecationWarning in a normal string literal).
Chrome = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
browser = webdriver.Chrome(Chrome)
# Explicit-wait helper bound to the shared browser, 2-second timeout.
wait = WebDriverWait(browser, 2)
# Re-wrap stdout so Chinese text prints correctly on a GBK Windows console.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# Module-level accumulator for every scraped "sentence<TAB>author" entry.
sentence = []
class make_monsy():
    """Crawl quote pages from juzimi.com and append the sentences to a file."""

    def crawl_source(self, url):
        """Fetch *url* with the shared Selenium browser.

        Returns the page HTML, or None when nothing was retrieved or an
        EOFError was raised.
        """
        print('going to crawl URL : %s' % url)
        try:
            browser.get(url)
            html = browser.page_source
            if html:
                return html
            return None
        except EOFError:
            return None

    def find_html(self, html):
        """Extract (sentence, author) pairs from *html*.

        Appends each pair to the module-level ``sentence`` list and returns
        that list.
        """
        find_h = re.findall('" title="查看本句" class="xlistju">(.*?)</a>', html)
        find_n = re.findall('class="views-field-field-oriwriter-value">(.*?)</a>', html)
        # BUGFIX: the original nested the two loops, pairing every sentence
        # with every author (a cross product).  Pair them positionally.
        for quote, author in zip(find_h, find_n):
            sentence.append('\'%s\t%s\',' % (quote, author))
        return sentence

    def page_num(self, html):
        """Return the absolute URL of the 'next page' link in *html*.

        Raises IndexError when no next-page link exists (i.e. last page) —
        ``core`` relies on that to terminate.
        """
        soup = BeautifulSoup(html, 'lxml')
        page = soup.select('#block-views-xqalbumpage-block_1 > div > div > div > div > div.item-list > ul > li.pager-next.last > a')
        next_page = re.findall('href="(.*?)"', '%s' % page[0])
        url = 'https://www.juzimi.com%s' % next_page[0]
        return url

    def write_sentence(self, sentence, path):
        """Append every entry of *sentence* to the file at *path*."""
        try:
            # Open the file once instead of re-opening it for every line.
            with open(path, 'a') as save:
                for line in sentence:
                    save.write('%s\n\n' % line)
        except UnicodeEncodeError:
            # Best-effort: skip content the target encoding cannot represent.
            pass
        print('sentence saving to %s' % path)

    def core(self, url):
        """Crawl from *url*, following 'next page' links until exhausted."""
        path = 'C:\\Users\\121812\\Desktop\\makemoney.txt'
        while True:
            try:
                # Fetch each page once; the original fetched every page twice.
                html = self.crawl_source(url)
                self.write_sentence(self.find_html(html), path)
                print(url)
                url = self.page_num(html)
            except IndexError:
                # No next-page link found: crawling is finished.
                print('already not web ...')
                break
# Script entry point: kick off the crawl at a fixed album URL.
if __name__ == '__main__':
    crawler = make_monsy()
    crawler.core('https://www.juzimi.com/album/3209903')