# 正则表达式爬取古诗文 (scrape classical Chinese poems from gushiwen.org with regular expressions)
# -*- coding:utf-8 -*-
import requests
import re
class Gushiwen_Spider(object):
    """Scrape poem listings from gushiwen.org pages with regular expressions.

    Each listing page is downloaded with ``requests``, the poem fields are
    pulled out of the raw HTML with regexes, and every poem is printed as a
    dict with the keys ``"tittle"``, ``"dynasty"``, ``"author"`` and
    ``"content"``.
    """

    # Patterns are compiled once at class level instead of on every page.
    # DOTALL lets ``.`` cross newlines, since the markup spans several lines.
    _TITLE_RE = re.compile(r'<div class="cont">.*?<b>(.*?)</b>', re.DOTALL)
    # Text of the first link inside the "source" paragraph.
    _DYNASTY_RE = re.compile(r'<p class="source"><a .*?>(.*?)</a>', re.DOTALL)
    # Text of the second link inside the "source" paragraph.
    _AUTHOR_RE = re.compile(
        r'<p class="source"><a .*?>.*?<a .*?>(.*?)</a>', re.DOTALL
    )
    _CONTENT_RE = re.compile(
        r'<div class="contson" .*?>(.*?)</div>', re.DOTALL
    )
    # Used to strip markup from the poem body (single-line tags only, as the
    # pattern is deliberately compiled without DOTALL).
    _TAG_RE = re.compile(r'<.*?>')

    def __init__(self):
        """Set the page URL template and the HTTP headers sent with requests."""
        # ``{}`` is replaced by the page number in ``main``.
        self.url = "https://www.gushiwen.org/default_{}.aspx"
        self.headers = {
            "Referer": "https://www.gushiwen.org/gushi/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        }

    def get_response(self, url, timeout=10):
        """Download ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` can block indefinitely on a stalled
                connection.

        Returns:
            The page body as text.

        Raises:
            requests.exceptions.RequestException: On connection failure or
                timeout.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content.decode("utf-8")

    def parse_data(self, data):
        """Extract poems from page HTML, print each one and return them all.

        Args:
            data: HTML text of a listing page.

        Returns:
            A list of dicts, one per poem, with the keys ``"tittle"``,
            ``"dynasty"``, ``"author"`` and ``"content"``. ``"content"`` is
            the tag-stripped poem body split on whitespace into a list of
            strings. The list is empty when nothing matches.
        """
        titles = self._TITLE_RE.findall(data)
        dynasties = self._DYNASTY_RE.findall(data)
        authors = self._AUTHOR_RE.findall(data)
        contents = [
            self._TAG_RE.sub("", fragment).split()
            for fragment in self._CONTENT_RE.findall(data)
        ]

        # The four lists are paired by position; zip stops at the shortest
        # one, so unmatched trailing entries are dropped.
        poems = [
            {
                "tittle": tittle,
                "dynasty": dynasty,
                "author": author,
                "content": content,
            }
            for tittle, dynasty, author, content in zip(
                titles, dynasties, authors, contents
            )
        ]

        for poem in poems:
            print(poem)
        return poems

    def main(self, start=0, stop=101):
        """Fetch and parse every page numbered from ``start`` up to ``stop``.

        Args:
            start: First page number to request (inclusive).
            stop: Page number to stop before (exclusive). The defaults cover
                pages 0 through 100.
        """
        for page in range(start, stop):
            data = self.get_response(self.url.format(page))
            self.parse_data(data)
# Run the full crawl only when this file is executed as a script.
if __name__ == '__main__':
    spider = Gushiwen_Spider()
    spider.main()