python-递归爬虫爬取网站所有子链接
python - 爬虫递归抓取网站信息
实现思路:
- 抓取网站所有信息并保存;
- 使用正则表达式提取子链接;
- 递归爬取子链接,保存爬取html信息
# 导入模块
import os
import re
from urllib import parse

import requests
exist_url = [] # URLs already crawled (the visited set that stops re-crawling)
# Counter used to number the output files: file/index0.txt, file/index1.txt, ...
writeCount = 0
# Every sub-link discovered so far, accumulated across all pages
urls = []
def load(url):
    """Recursively crawl *url* and save each fetched page to file/indexN.txt.

    Relies on the module-level globals:
      exist_url  -- URLs already fetched (visited set; prevents infinite recursion)
      urls       -- every sub-link discovered so far
      writeCount -- counter used to number the output files

    Network errors are skipped (best-effort crawl); nothing is returned.
    """
    global writeCount
    # Check whether this URL was already crawled. The original never recorded
    # visited URLs, so the recursion re-fetched pages forever.
    if url in exist_url:
        return
    exist_url.append(url)
    try:
        req = requests.get(url, timeout=10)
    except requests.RequestException:
        return  # best-effort: skip unreachable or failing pages
    # requests decodes the body with the detected charset; the original
    # html.decode('utf-8') crashed on any non-UTF-8 page.
    html = req.text
    # Extract href targets ending in ".htm". The dot is escaped: the original
    # pattern's bare ".htm" matched any character before "htm", and its
    # "[^index]" was a (broken) character class, not a word exclusion.
    con = re.findall(r'(?<=href=")[^"]+\.htm', html)
    for link in con:
        # Intent of the original pattern: skip index pages.
        if link.endswith('index.htm'):
            continue
        # Resolve relative links against the page that contained them; the
        # original joined against the bare scheme 'https://', which yields
        # malformed URLs like 'https:///page.htm'.
        newUrl = parse.urljoin(url, link)
        urls.append(newUrl)
    print(urls)
    # Drop duplicates and anything already crawled.
    unique_list = list(set(urls) - set(exist_url))
    print(unique_list)
    # Save the fetched page; make sure the output directory exists first.
    os.makedirs('file', exist_ok=True)
    with open('file/index' + str(writeCount) + '.txt', 'w', encoding='utf-8') as fp:
        fp.write(req.text)
    writeCount += 1
    # Recurse into every newly discovered link.
    for child in unique_list:
        load(child)
# Entry point: start the crawl only when run as a script, not on import.
# (The original kicked off the crawl as a module-level side effect.)
if __name__ == '__main__':
    # Seed URL for the crawl -- placeholder; fill in the target site.
    url = 'https://'
    load(url)