Without further ado, straight to the code. The crawler is made up of three scripts: first a driver that harvests the raw link list and deduplicates it, then the crawler module ClimbHealthSkyCom itself, and finally a script that downloads the article contents from the categorized link stores.
# Harvest the raw link list from the target site with ClimbHealthSkyCom,
# then deduplicate it.
def LinkDeduplication():
    print('Starting link deduplication')
    linkList = []
    try:
        with open('secondList.txt', 'r') as fp:
            linkList = fp.readlines()
    except IOError:
        print('secondList.txt not found')
        return
    # Deduplicate via a set, then sort the surviving links
    dealLinkListResult = sorted(set(linkList))
    # Save the deduplicated links
    try:
        with open('dealLinkListResult.txt', 'a') as fp:
            for temp in dealLinkListResult:
                fp.write(temp)
    except IOError:
        print('Failed to create dealLinkListResult.txt')
    print('Link deduplication finished')

if __name__ == '__main__':
    import ClimbHealthSkyCom
    ClimbHealthSkyCom.main()  # build secondList.txt first
    LinkDeduplication()
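One caveat: list(set(...)) throws away the order in which the links were crawled, and sort() then orders them lexicographically. If the first-seen order matters, a minimal order-preserving variant (assuming Python 3.7+, where dict preserves insertion order) looks like this:

# Order-preserving deduplication sketch (assumes Python 3.7+)
with open('secondList.txt', 'r') as fp:
    uniqueLinks = list(dict.fromkeys(fp.readlines()))
with open('dealLinkListResult.txt', 'a') as out:
    out.writelines(uniqueLinks)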
# Crawl diabetes-related article links from www.health-sky.com
# author: 王耀国
# date: 2019-12-29
'''
Analysis notes:
The front page loads its content dynamically; the paginated URLs take the form
http://www.health-sky.com/zhishi/page/?, where ? is a page number whose maximum
is not known in advance. The right-hand side of the front page lists several
major categories that all share this URL format, so the site can be crawled
category by category. The list listOld[] records links that have already been
crawled. Crawling is currently single-threaded; a later version could use
multiple threads to update the link store and parse pages at the same time.
A second-level link store (secondList.txt) collects the article links found
on each page.
'''
import urllib.request
from bs4 import BeautifulSoup
import re
import os

# Step 1: fetch a page. The content is loaded dynamically, so we request the
# paginated URLs found during the analysis above.
def gethtml(url):
    print('Fetching page')
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    res = response.read().decode('utf-8')
    print('Fetch finished')
    return res

# Parse a page with Beautiful Soup and append every article link to secondList.txt
def ParsingWebpage(res):
    print('Parsing page')
    secondList = []  # newly found links
    if os.path.exists('secondList.txt'):
        fp2 = open('secondList.txt', 'a')
        print("Opened existing 'secondList.txt'")
    else:
        fp2 = open('secondList.txt', 'w')
        print("Created new 'secondList.txt'")
    soup = BeautifulSoup(res, "lxml")
    # One generic pattern matches the article links of every category, so no
    # per-category regex list is needed
    for linkList in soup.find_all('a', href=re.compile(r"http://www.health-sky.com/.*?\.html")):
        secondList.append(linkList['href'] + "\n")
    for temp in secondList:
        fp2.write(temp)
    fp2.close()
    print('Parsing finished')

# Entry point of the crawl
def main():
    print('Entering main')
    # Category index pages, taken from the front page's right-hand column
    firstLink = [
        'http://www.health-sky.com/zhishi',
        'http://www.health-sky.com/bingyin',
        'http://www.health-sky.com/zhengzhuang',
        'http://www.health-sky.com/zhiliao',
        'http://www.health-sky.com/jiancha',
        'http://www.health-sky.com/zhenduan',
        'http://www.health-sky.com/bingfazheng',
        'http://www.health-sky.com/huli',
        'http://www.health-sky.com/yufang',
        'http://www.health-sky.com/yiyuan',
        'http://www.health-sky.com/yaowu',
    ]
    listOld = []  # links that have already been crawled
    # Open (or create) the store of crawled links
    if os.path.exists('oldList.txt'):
        fp = open('oldList.txt', 'a')  # append new links
        print('Opened existing oldList.txt')
    else:
        fp = open('oldList.txt', 'w')
        print('Created new oldList.txt')
    # Crawl each category, following /page/2, /page/3, ... until a request fails
    for temp in firstLink:
        print(temp)
        i = 1
        res = gethtml(temp)
        listOld.append(temp)
        while res:
            ParsingWebpage(res)
            i += 1
            urlTemp = temp + '/page/%d' % i
            print(urlTemp)
            try:
                res = gethtml(urlTemp)
                listOld.append(urlTemp)
            except:
                print('Finished crawling %s: %d pages in total' % (temp, i - 1))
                break
    for temp in listOld:
        fp.write(temp + "\n")
    fp.close()
    print('Leaving main')
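The docstring above notes that crawling is currently single-threaded and that a multithreaded version is planned. As a rough illustration of that idea (a sketch, not the author's implementation), the per-category crawl could be parallelized with concurrent.futures; fetching is I/O-bound, so a small thread pool helps, but ParsingWebpage appends to a shared file and has to be serialized with a lock:

# Minimal multithreaded sketch of the planned improvement, assuming the
# gethtml and ParsingWebpage functions defined above. Illustration only:
# secondList.txt is shared across threads, so a lock serializes the
# parse/write step while fetching runs concurrently.
import threading
from concurrent.futures import ThreadPoolExecutor

parse_lock = threading.Lock()

def crawl_category(base_url):
    i = 1
    url = base_url
    while True:
        try:
            res = gethtml(url)
        except Exception:
            print('Finished crawling %s: %d pages' % (base_url, i - 1))
            break
        with parse_lock:  # serialize writes to the shared secondList.txt
            ParsingWebpage(res)
        i += 1
        url = base_url + '/page/%d' % i

def main_threaded(first_link):
    # One task per category, capped at 4 concurrent fetches
    with ThreadPoolExecutor(max_workers=4) as pool:
        pool.map(crawl_category, first_link)

main_threaded(firstLink) would replace the sequential loop in main(); the listOld bookkeeping would need the same locking treatment.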
# Crawl the final article contents from the categorized link stores built
# out of dealLinkListResult.txt
from bs4 import BeautifulSoup
import os
import ClimbHealthSkyCom

def ClimbFinalResult():
    print('Entering ClimbFinalResult')
    # Final link stores, one file per category
    linkList = [
        'Link/LinkOfCauseOfDiabetes.txt',
        'Link/LinkOfDiabetesCare.txt',
        'Link/LinkOfDiabetesCheck.txt',
        'Link/LinkOfDiabetesComplications.txt',
        'Link/LinkOfDiabetesDiagnosis.txt',
        'Link/LinkOfDiabetesHospital.txt',
        'Link/LinkOfDiabetesKnowledge.txt',
        'Link/LinkOfDiabetesMedication.txt',
        'Link/LinkOfDiabetesPrevention.txt',
        'Link/LinkOfDiabetesSymptoms.txt',
        'Link/LinkOfDiabetesTreatment.txt',
    ]
    # Output files, in the same order as linkList
    contentList = [
        'content/CententOfCauseOfDiabetes.txt',
        'content/CententOfDiabetesCare.txt',
        'content/CententOfDiabetesCheck.txt',
        'content/CententOfDiabetesComplications.txt',
        'content/CententOfDiabetesDiagnosis.txt',
        'content/CententOfDiabetesHospital.txt',
        'content/CententOfDiabetesKnowledge.txt',
        'content/CententOfDiabetesMedication.txt',
        'content/CententOfDiabetesPrevention.txt',
        'content/CententOfDiabetesSymptoms.txt',
        'content/CententOfDiabetesTreatment.txt',
    ]
    contenNumber = 0  # keeps each output file aligned with its link store
    for temp in linkList:
        print(temp)
        print(contenNumber)
        try:
            fp = open(temp, 'r')
            print('Opened %s' % temp)
        except IOError:
            print('Failed to open %s' % temp)
            contenNumber += 1
            continue
        # Open (or create) the matching output file
        if os.path.exists(contentList[contenNumber]):
            fpContent = open(contentList[contenNumber], 'a', encoding="utf8")  # append
            print('Opened existing %s' % contentList[contenNumber])
        else:
            fpContent = open(contentList[contenNumber], 'w', encoding="utf8")
            print('Created new %s' % contentList[contenNumber])
        # Fetch and parse every link in this store
        link = fp.readline()
        while link:
            print(link)
            try:
                res = ClimbHealthSkyCom.gethtml(link.strip())
                soup = BeautifulSoup(res, 'lxml')
                # The article title sits in <h1>, the body text in <article>
                h1 = soup.find('h1')
                if h1:
                    fpContent.write(h1.text)
                for content in soup.find_all('article'):
                    fpContent.write(content.text)
            except:
                print('Failed to fetch page content!')
            link = fp.readline()
        fp.close()
        fpContent.close()
        contenNumber += 1

if __name__ == '__main__':
    ClimbFinalResult()
Note that the deduplicated links still have to be classified by hand into the corresponding txt files under Link/; only then can the final page contents be extracted.
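That manual classification step can also be scripted. Below is a minimal sketch; the slug-to-file mapping is an assumption inferred from the pinyin category slugs and the English file names (e.g. zhishi -> Knowledge, bingyin -> CauseOfDiabetes), not something taken from the original code:

# Hypothetical helper that splits dealLinkListResult.txt into the per-category
# Link/*.txt files by matching the category slug in each URL. The mapping is
# an assumption inferred from the slugs and file names above.
import os

SLUG_TO_FILE = {
    'zhishi':      'Link/LinkOfDiabetesKnowledge.txt',
    'bingyin':     'Link/LinkOfCauseOfDiabetes.txt',
    'zhengzhuang': 'Link/LinkOfDiabetesSymptoms.txt',
    'zhiliao':     'Link/LinkOfDiabetesTreatment.txt',
    'jiancha':     'Link/LinkOfDiabetesCheck.txt',
    'zhenduan':    'Link/LinkOfDiabetesDiagnosis.txt',
    'bingfazheng': 'Link/LinkOfDiabetesComplications.txt',
    'huli':        'Link/LinkOfDiabetesCare.txt',
    'yufang':      'Link/LinkOfDiabetesPrevention.txt',
    'yiyuan':      'Link/LinkOfDiabetesHospital.txt',
    'yaowu':       'Link/LinkOfDiabetesMedication.txt',
}

def classify_links(source='dealLinkListResult.txt'):
    os.makedirs('Link', exist_ok=True)
    with open(source, 'r') as fp:
        for line in fp:
            # URLs look like http://www.health-sky.com/<slug>/12345.html
            for slug, path in SLUG_TO_FILE.items():
                if '/%s/' % slug in line:
                    with open(path, 'a') as out:
                        out.write(line)
                    break

if __name__ == '__main__':
    classify_links()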