import requests
from bs4 import BeautifulSoup
# from selenium import webdriver  # only needed for the commented-out headless-browser code below
# import urllib.request           # unused
# import time                     # only needed for the commented-out headless-browser code below
'''
import pandas as pd  # data processing package
import numpy as np
'''
def test(rooturl):
    """Fetch rooturl and return the set of absolute links found on the page."""
    # rooturl = 'https://blog.csdn.net'
    # Alternative: render the page with a headless Firefox instead of requests.
    # options = webdriver.FirefoxOptions()
    # options.add_argument('-headless')
    # options.add_argument('--disable-gpu')
    # driver = webdriver.Firefox(firefox_options=options)
    # driver.get(rooturl)
    # time.sleep(50)
    # driver.stop_client()
    # ht = driver.page_source
    # Spoof a regular browser request so the server returns the normal HTML page.
    headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
               'Connection': 'Keep-Alive',
               'Host': 'blog.csdn.net',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
    page = requests.get(rooturl, headers=headers)
    soup = BeautifulSoup(page.text, "lxml")
    # soup = BeautifulSoup(ht, "lxml")
    st = set()
    for link in soup.find_all('a'):
        if 'href' in link.attrs:
            href = link.attrs['href']
            # Normalize protocol-relative and site-relative links to absolute URLs.
            if str(href).startswith("//"):
                href = "https:" + str(href)
            elif str(href).startswith("/"):
                href = "https://blog.csdn.net" + str(href)
            print(href)
            st.add(href)
    return st
"""
for link in soup.find_all('a'):
print(link.get("href"))
"""
def main():
    hrefs = test('https://blog.csdn.net')
    i = 0
    # Visit each link collected from the front page once.
    for href in hrefs:
        test(href)
        i += 1


if __name__ == '__main__':
    main()