import requests
import re
def main(text=None):
    """Scrape poem records from the gushiwen.org listing page and print them.

    Four regular expressions are run over the page HTML (titles, dynasties,
    authors, bodies) and the results are combined positionally into a list
    of dicts.

    Args:
        text: Optional HTML to parse. When ``None`` (the default) the page
            is downloaded from gushiwen.org; passing a string skips the
            network request entirely, which allows offline use.

    Returns:
        A list of dicts with the keys ``'title'``, ``'chaodai'`` (dynasty),
        ``'author'`` and ``'contens'`` (poem body, inner HTML left as-is).
        The same list is also printed to stdout.

    Raises:
        requests.RequestException: If the download times out, fails, or
            returns an HTTP error status (only when ``text`` is ``None``).
    """
    if text is None:
        url = 'https://www.gushiwen.org/default_1.aspx'
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            # NOTE(review): this is a captured browser session cookie; it is
            # presumably stale by now and should not live in source control.
            "cookie": "__guid=190620973.2922957719409206000.1553671750334.9504; ASP.NET_SessionId=l1ugsscdi2zmj5ku1ggy0kkx; Hm_lvt_04660099568f561a75456483228a9516=1553671752,1553673935; monitor_count=4; Hm_lpvt_04660099568f561a75456483228a9516=1553673958",
            "referer": "https://www.gushiwen.org/default_2.aspx",
        }
        # Without a timeout requests waits indefinitely on a stalled server.
        res = requests.get(url, headers=headers, timeout=10)
        # Fail loudly instead of running the regexes over an error page.
        res.raise_for_status()
        text = res.text

    # NOTE(review): the tag/attribute parts of these patterns are assumptions
    # about the page markup (poem in <div class="cont">, title in <b>,
    # dynasty and author as the two links in <p class="source">, body in
    # <div class="contson">) -- confirm against the live HTML.
    # re.DOTALL is required because each fragment spans several lines.
    titles = re.findall(
        r'<div\s+class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    chaodai = re.findall(
        r'<p\s+class="source">.*?<a[^>]*>(.*?)</a>', text, re.DOTALL)
    author = re.findall(
        r'<p\s+class="source">.*?<a[^>]*>.*?</a>.*?<a[^>]*>(.*?)</a>',
        text, re.DOTALL)
    contens = re.findall(
        r'<div\s+class="contson"[^>]*>(.*?)</div>', text, re.DOTALL)

    # zip() pairs the four lists by position and stops at the shortest one,
    # so a field missing from one poem shortens (and can misalign) the output.
    infos = [
        {
            'title': title,
            'chaodai': dynasty,
            'author': poet,
            'contens': body,
        }
        for title, dynasty, poet, body in zip(titles, chaodai, author, contens)
    ]
    print(infos)
    return infos


if __name__ == '__main__':
    main()