Content
Given a URL you enter, crawl that page for its sub-links and each link's title.
# Sub-links: title + url
import re
import ssl
import urllib.request
from urllib import parse
import requests
from bs4 import BeautifulSoup
import datetime

exist_url = []  # URLs that have already been crawled
# Fetch a page's <title>
def get_title(url):
    try:
        # A default SSLContext here skips certificate verification
        context = ssl.SSLContext()
        page = urllib.request.urlopen(url=url, context=context)
    except Exception as e:
        print(e)
    else:
        html = page.read().decode('utf-8', errors='ignore')
        title = re.findall('<title>(.+)</title>', html)
        return title
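# Usage note: get_title() returns the list of regex matches, so for a reachable
# page the title text is get_title(url)[0]; on a failed request it prints the
# error and returns None.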
# Check whether a URL is valid
def isValidURL(url):
    # Regex for a syntactically valid http/https URL
    regex = (r"((http|https)://)(www\.)?"
             r"[a-zA-Z0-9@:%._\+~#?&//=]{2,256}"
             r"\.[a-z]{2,6}\b"
             r"([-a-zA-Z0-9@:%._\+~#?&//=]*)")
    # An empty string is not a valid URL
    if url is None:
        return False
    # Return whether the string matches the regex
    return re.search(regex, url) is not None
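# Quick check (results follow from the regex above):
#   isValidURL('https://www.example.com/a?b=1')  -> True
#   isValidURL('example.com')                    -> False (no scheme)
#   isValidURL(None)                             -> False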
# Collect a page's sub-links
def load(url):
    # Remember pages that have already been crawled
    exist_url.append(url)
    all_urls = []
    # Fetch the page content
    headers = {'Connection': 'close'}
    if isValidURL(url):
        try:
            r = requests.get(url, headers=headers, verify=False, timeout=10)
            if r.status_code == 200:
                data = r.text
                # Find all links with a regex
                link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
                # Deduplicate, resolve relative links, and drop pages already crawled
                for url1 in set(link_list):
                    newUrl = parse.urljoin(url, url1)
                    if isValidURL(newUrl) and newUrl not in exist_url:
                        all_urls.append(newUrl)
        except Exception as e:
            print('Bad link:', e)
    return all_urls
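# Note: load() gathers one level of links only; because exist_url persists
# across calls, calling load() again on one of the returned links will skip
# any page that was already visited.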
url = 'http://www.baidu.cn'
page_1 = load(url)  # Collect the sub-links of the given URL
for i, u1 in enumerate(page_1):
    try:
        s1 = get_title(u1)
        title1 = s1[0]
    except Exception:
        title1 = 'no title found'
    print({"ID": i,
           "time": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
           "title": title1,
           "url": u1})
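The script imports BeautifulSoup but extracts links with a regex, which can miss or mangle unusual markup. As an alternative, BeautifulSoup can read the href attributes directly. Below is a minimal sketch of such a replacement for load(); the name load_bs is illustrative, not part of the original script, and it reuses the isValidURL() check defined above.

import requests
from bs4 import BeautifulSoup
from urllib import parse

def load_bs(url):
    # Same idea as load(), but let BeautifulSoup locate the <a href=...> tags
    r = requests.get(url, headers={'Connection': 'close'}, verify=False, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = []
    for a in soup.find_all('a', href=True):
        newUrl = parse.urljoin(url, a['href'])
        if isValidURL(newUrl) and newUrl not in links:
            links.append(newUrl)
    return links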