Crawling sublinks from CSDN

The script below fetches a CSDN blog page with requests, parses it with BeautifulSoup, and collects every link on the page as an absolute URL; main() then walks the collected links breadth-first.

import requests
from bs4 import BeautifulSoup
#from selenium import webdriver  # only needed for the commented-out headless-Firefox variant below
#import time
'''
import pandas as pd  # data-processing package
import numpy as np
'''


def test(rooturl):
    """Fetch rooturl and return the set of absolute link URLs found on the page."""
    #rooturl = 'https://blog.csdn.net'
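    # Alternative kept for reference: render the page with headless Firefox via
    # Selenium (useful for JS-heavy pages) and parse driver.page_source instead
    # of the requests response.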
    #options = webdriver.FirefoxOptions()
    #options.add_argument('-headless')
    #options.add_argument('--disable-gpu')
    #driver = webdriver.Firefox(firefox_options=options)
    #driver.get(rooturl)
    #time.sleep(50)
    #driver.stop_client()
    #ht = driver.page_source

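    # Browser-like request headers so CSDN serves the normal desktop HTML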
    headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
               'Connection': 'Keep-Alive',
               'Host': 'blog.csdn.net',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
    page = requests.get(rooturl, headers=headers)
    soup = BeautifulSoup(page.text, "lxml")
    #soup = BeautifulSoup(ht, "lxml")
    st = set()
    for link in soup.find_all('a'):
        if 'href' in link.attrs:
            href = link.attrs['href']
            # Normalize protocol-relative and site-relative links to absolute URLs
            if href.startswith("//"):
                href = "https:" + href
            elif href.startswith("/"):
                href = "https://blog.csdn.net" + href
            print(href)
            st.add(href)
    return st
    """
    for link in soup.find_all('a'):
            print(link.get("href"))
    """


def main():
    seen = set()
    queue = ['https://blog.csdn.net']
    # Breadth-first crawl over the links test() returns; the page cap is an
    # arbitrary limit so the run terminates instead of crawling indefinitely
    while queue and len(seen) < 100:
        url = queue.pop(0)
        if url in seen or not url.startswith('http'):
            continue
        seen.add(url)
        queue.extend(test(url))


if __name__ == '__main__':
    main()
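
As written, the crawl follows every link it collects, including off-site ones. Below is a minimal sketch of a same-domain filter using urllib.parse; the helper name is_csdn_link is hypothetical, not part of the original script:

from urllib.parse import urlparse


def is_csdn_link(href):
    # Keep only links whose host is exactly blog.csdn.net
    try:
        return urlparse(href).netloc == 'blog.csdn.net'
    except ValueError:
        return False

main() could then replace queue.extend(test(url)) with queue.extend(h for h in test(url) if is_csdn_link(h)) to keep the crawl on the blog.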
