"""Depth-limited, depth-first link crawler built on ``requests`` and ``re``."""
import re

import requests

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

# Seconds to wait for a server before giving up on a request; without a
# timeout a single stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10

# Example of the anchor markup the link pattern targets:
#   <a class="title-content" href="https://www.baidu.com/s?cl=3&tn=baid%9D%A">世界杯八强出炉</a>
#
# ``[^>]*?`` keeps the match inside a single tag.  A greedy ``.*`` in front of
# ``href`` would swallow every earlier anchor on the same line and leave only
# the last link of that line.
_LINK_RE = re.compile(r'<a\s[^>]*?href="(https?://[^"]*)"')


def getUrl(url):
    """Fetch *url* and return the absolute http(s) links found in its anchors.

    Args:
        url: Address of the page to download.

    Returns:
        A list of ``href`` values (double-quoted, starting with ``http://`` or
        ``https://``) in document order.  Duplicates are not removed.

    Raises:
        requests.RequestException: Propagated from :func:`getHtml`.
    """
    return _LINK_RE.findall(getHtml(url))


def getHtml(url):
    """Download *url* and return the body decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped rather than raising.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as ``str``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return response.content.decode('utf-8', 'ignore')


def vastSpider(depth):
    """Crawl from the URLs queued in ``urlList`` down to level *depth*.

    Works on two module-level objects that must exist before the call:
    ``urlList`` (pending URLs) and ``urlDict`` (URL -> level, which also
    serves as the visited set).  URLs are taken from the end of ``urlList``
    and new ones are appended to the end, so the traversal is depth-first.

    Args:
        depth: Deepest level whose pages are still fetched.  URLs discovered
            below it are recorded in ``urlDict`` but never downloaded.
    """
    while urlList:
        url = urlList.pop()
        level = urlDict[url]
        if level > depth:
            continue

        print('\t\t\t\t' * level, '这是第%d层,%s' % (level, url))
        try:
            children = getUrl(url)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole crawl.
            print('\t\t\t\t' * level, 'skipped %s: %s' % (url, exc))
            continue

        for newurl in children:
            if newurl not in urlDict:
                urlDict[newurl] = level + 1
                urlList.append(newurl)


if __name__ == '__main__':
    startUrl = 'https://www.baidu.com/s?wd=世界杯'
    urlList = [startUrl]
    urlDict = {startUrl: 1}
    vastSpider(4)
爬虫(20):深度爬取策略(2)
最新推荐文章于 2024-08-17 11:30:00 发布