# Crawler | Scrape listed-company annual reports from CNINFO (巨潮资讯网)
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import requests
import os
import random
from PyPDF2 import PdfFileReader
# Build the Chrome options and launch the Selenium-driven browser.
# NOTE(review): `browser` is presumably used by scraping code later in the file — confirm.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--start-maximized')  # start Chrome with a maximized window
# Creating the driver here is a module-level side effect: it runs as soon as the script is executed.
browser = webdriver.Chrome(options=chrome_options)
# One-time setup: uncomment the next line on the first run to create the folder on drive D,
# and keep it commented out on repeated runs (os.makedirs fails if the folder already exists).
#os.makedirs('D:\\公司年报')
# Fetch the raw content behind a URL
def get_html_content(url, timeout=30):
    """Download *url* with HTTP GET and return the raw response body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server (connection and each read)
            before giving up. ``requests`` waits indefinitely when no
            timeout is passed, so a single unresponsive server would
            otherwise stall the whole run.

    Returns:
        The undecoded response body as ``bytes`` when the status code is
        200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection failure, timeout, ...).
    """
    # The request identifies itself with a mobile Chrome User-Agent string.
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    # ``content`` is the raw byte payload, so no text encoding has to be
    # configured on the response before returning it.
    return response.content
#保存pdf
def report_save(url,pdf_name):
report = get_html_content(url)
path = "D:\