import requests
from bs4 import BeautifulSoup
# from selenium import webdriver  # only needed for the commented-out headless-browser code below
# import urllib.request           # unused
# import time                     # only needed for the commented-out headless-browser code below
'''
import pandas as pd  # data processing package
import numpy as np
'''
def test(rooturl):
    """Fetch rooturl and return the set of absolute links found on the page."""
    # rooturl = 'https://blog.csdn.net'
    # Alternative: render the page with a headless Firefox instead of requests.
    # options = webdriver.FirefoxOptions()
    # options.add_argument('-headless')
    # options.add_argument('--disable-gpu')
    # driver = webdriver.Firefox(firefox_options=options)
    # driver.get(rooturl)
    # time.sleep(50)
    # driver.stop_client()
    # ht = driver.page_source
    # Spoof a regular browser request so the server returns the normal HTML page.
    headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
               'Connection': 'Keep-Alive',
               'Host': 'blog.csdn.net',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
    page = requests.get(rooturl, headers=headers)
    soup = BeautifulSoup(page.text, "lxml")
    # soup = BeautifulSoup(ht, "lxml")
    st = set()
    for link in soup.find_all('a'):
        if 'href' in link.attrs:
            href = link.attrs['href']
            # Normalize protocol-relative and site-relative links to absolute URLs.
            if str(href).startswith("//"):
                href = "https:" + str(href)
            elif str(href).startswith("/"):
                href = "https://blog.csdn.net" + str(href)
            print(href)
            st.add(href)
    return st
"""
for link in soup.find_all('a'):
print(link.get("href"))
"""
def main():
    hrefs = test('https://blog.csdn.net')
    i = 0
    # Visit each link collected from the front page once.
    for href in hrefs:
        test(href)
        i += 1


if __name__ == '__main__':
    main()