#story.py
#coding=utf8
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
def download(url, retry_nums=2):
    """Fetch the fully rendered page source of *url* with headless PhantomJS.

    Parameters
    ----------
    url : str
        Page to load.
    retry_nums : int
        Remaining retry attempts on failure (default 2).

    Returns
    -------
    str or None
        The rendered HTML, or None when every attempt failed.
    """
    # NOTE(review): headers are only relevant to the commented-out requests
    # fallback below; the PhantomJS driver does not use them.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400'}
    driver = None
    try:
        driver = webdriver.PhantomJS(executable_path='C:/YKP/software/phantomjs-2.1.1-windows/bin/phantomjs.exe')
        driver.get(url)
        html = driver.page_source
        # html = requests.get(url,headers=headers)
        # html.encoding = 'utf-8'
    except Exception as e:
        # Bug fix: the original caught requests.HTTPError, which selenium
        # never raises, so the handler (and its 5xx retry test on e.code)
        # was dead code and any real failure escaped unhandled.
        print('Download error' + e.__str__())
        html = None
        if retry_nums > 0:
            return download(url, retry_nums - 1)
    finally:
        # Always release the browser process — the original only called
        # quit() on success and leaked one PhantomJS process per failure.
        if driver is not None:
            driver.quit()
    return html
def get_links(url):
    """Scrape chapter links from the index page at *url*.

    Parameters
    ----------
    url : str
        Index page whose <div class="main"> lists the chapter anchors.

    Returns
    -------
    dict
        Mapping of link text -> href for every link whose href contains
        'html'; empty when the page could not be fetched or parsed.
    """
    html = download(url)
    if html is None:
        # download() exhausted its retries; nothing to parse.
        return {}
    soup = BeautifulSoup(html, 'html.parser')
    # Bug fix: the original passed the *set* {'class','main'} where bs4
    # expects an attrs *dict*; the intent is the <div class="main"> node.
    main_div = soup.find('div', {'class': 'main'})
    if main_div is None:
        return {}
    s_dict = {}
    for link in main_div.findAll('a'):
        s_link = link['href']
        s_name = link.text
        # Skip non-chapter anchors (navigation etc.).
        if 'html' not in s_link:
            continue
        s_dict[s_name] = s_link
    return s_dict
def get_content(url):
    """Download every chapter linked from *url* and append it to a text file.

    Parameters
    ----------
    url : str
        Index page; chapter links are discovered via get_links().

    Side effects
    ------------
    Appends each chapter's paragraphs to the hard-coded path
    C:/YKP/story/pf.txt (UTF-8).
    """
    story_dict = get_links(url)
    filename = 'C:/YKP/story/pf.txt'
    # 'with' guarantees the file is closed even if a fetch or parse fails
    # (the original open()/close() pair leaked the handle on any exception).
    with open(filename, 'a+', encoding='utf-8') as story:
        for story_name, story_url in story_dict.items():
            print(story_name)
            html = download(story_url)
            if html is None:
                # Chapter could not be fetched; skip instead of crashing
                # inside BeautifulSoup(None, ...).
                continue
            soup = BeautifulSoup(html, 'html.parser')
            # Bug fix: attrs must be a dict, not the set {'class','content'}.
            content_div = soup.find('div', {'class': 'content'})
            if content_div is None:
                continue
            story.write('##########' + story_name + '##############')
            for content in content_div.findAll('p'):
                # The in-page "next chapter" navigation paragraph marks the
                # end of the chapter text.
                if '下一章' in content.text:
                    break
                story.write(content.text.strip())
            story.write('\n')
if __name__ == '__main__':
    # Entry point: scrape the whole book starting from the index page.
    index_url = 'http://www.pingfandeshijie.net/'
    get_content(index_url)