import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time
def getHTMLText(url):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    The response encoding is replaced by ``apparent_encoding`` (the encoding
    detected from the content) before the body is decoded.

    Args:
        url: Address to request; the request uses a 30-second timeout.

    Returns:
        The decoded body, or the sentinel string "产生异常" when the request
        fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx answers into an exception
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures map to the sentinel. A bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit, making a long crawl
        # impossible to interrupt.
        return "产生异常"
def _label_parent_text(soup, label_id):
    """Return the text of the parent of ``<label id=label_id>`` without spaces/newlines.

    Returns an empty string when the page has no such label.
    """
    label = soup.find('label', {'id': label_id})
    if label is None:
        return ""
    # Drop the spaces and line breaks that pad the text in the page markup.
    return label.parent.text.replace(' ', '').replace('\n', '')


def infoWrite(sheet, soup, count):
    """Write affiliation, fund and keyword text from a parsed page into one row.

    This is the extraction code shared by the crawling routines.

    Args:
        sheet: Worksheet-like object exposing ``cell(row=..., column=...)``
            whose result has an assignable ``value``.
        soup: Parsed page exposing ``find(name, attrs)``.
        count: Row index to write to.

    Column 4 receives the text of ``<div class="orgn">``, column 5 the text
    around the ``catalog_FUND`` label and column 6 the text around the
    ``catalog_KEYWORD`` label. A field that is missing from the page is
    written as an empty string.
    """
    orgn_tag = soup.find('div', {'class': 'orgn'})
    # Guard the affiliation like the other two fields: a page without the div
    # (e.g. an error page) must not abort the run with an AttributeError.
    orgn = "" if orgn_tag is None else orgn_tag.text

    fund = _label_parent_text(soup, 'catalog_FUND')
    keywd = _label_parent_text(soup, 'catalog_KEYWORD')
    print(keywd)  # echo the keywords to stdout, as the original code did

    sheet.cell(row=count, column=4).value = orgn
    sheet.cell(row=count, column=5).value = fund
    sheet.cell(row=count, column=6).value = keywd
def getJournalInfos(start_url,end_url,sheet,count,book,path): #爬取1994-2001年
for i in range(1994,2002):
if i in [1996,1997,1998]: #判断是否为96-98年,因为网页格式有变化
for j in range(1,5): #1994-2001年只有4个月
month = '0'+str(j) if len(str(j))==1 else str(j) #形成01、02这种格式数据
for k in range(19):
num = '.00'+str(k) if len(str(k))==1 else '.0'+str(k) #形成01、02这种格式数据