# 用Python爬取—灵武封神_第一章 死里逃生
import requests
import re
import json
from requests.exceptions import RequestException
def get_one_page(url):
    """Fetch a page and return its HTML text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200; ``None`` for any other status code or when the request
        fails with a ``RequestException`` (including a timeout).
    """
    headers = {
        # The header name is "User-Agent" with a hyphen; an underscore
        # would send an unrelated header and leave the default
        # python-requests user agent in place.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        # Without a timeout requests may block indefinitely on a stalled
        # server. requests.exceptions.Timeout is a RequestException
        # subclass, so it is handled by the same except clause.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None

    if response.status_code != 200:
        return None

    # Force UTF-8 decoding instead of relying on the guessed encoding.
    response.encoding = 'utf8'
    return response.text
"""
try-except代码块;
如果try代码块中的代码运行起来没有问题,Python将跳过except代码块;
如果try代码块中的代码引发了错误,Python将查找所指定的错误类型与引发的错误相匹配的except代码块,并运行其中的代码
"""
def parse_one_page(html):
    """Extract chapter records from a page's HTML.

    Args:
        html: Page source as a string. ``None`` or an empty string is
            accepted and produces no records, so the result of a failed
            download can be passed in directly.

    Yields:
        One dict per regex match with the keys ``'title'`` (contents of
        the ``<title>`` tag), ``'name'`` (contents of the
        ``title_txtbox`` div) and ``'content'`` (raw markup from the
        first ``<p>`` after ``"acticleBody">`` up to the next
        ``</div>``). Inner HTML tags are not stripped.
    """
    if not html:
        # re.findall raises TypeError on None; treat "no page" as "no items".
        return

    # re.S lets '.' span newlines, since the page markup is multi-line.
    pattern = re.compile('<title>(.*?)</title>.*?<div class="title_txtbox">(.*?)</div>.*?"acticleBody">.*?<p>(.*?)</div>', re.S)
    for title, name, content in pattern.findall(html):
        yield {
            'title': title,
            'name': name,
            'content': content,
        }
def write_to_file(content, path='lingwufengshen.txt'):
    """Append one record to a text file as a single JSON line.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to. Defaults to ``'lingwufengshen.txt'`` in
            the current working directory.
    """
    # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in
    # the file instead of writing \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
def main():
    """Download the hard-coded chapter page and save every parsed record.

    Each record yielded by ``parse_one_page`` is passed to
    ``write_to_file``. If the download fails, a message is printed and
    nothing is written.
    """
    url = 'http://book.zongheng.com/chapter/891033/58356325.html'
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a request error or non-200 status;
        # stop here rather than handing None to the parser.
        print('Failed to fetch page: ' + url)
        return
    for item in parse_one_page(html):
        write_to_file(item)


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()