Content
Given a URL you enter, crawl that page for its sub-links and each link's title.
# Sub-links: title + url
import re
import ssl
import urllib.request
from urllib import parse
import requests
from bs4 import BeautifulSoup
import datetime

exist_url = []  # URLs that have already been crawled
# Fetch a page's <title>
def get_title(url):
    try:
        # A default SSLContext here skips certificate verification
        context = ssl.SSLContext()
        page = urllib.request.urlopen(url=url, context=context)
    except Exception as e:
        print(e)
    else:
        html = page.read().decode('utf-8', errors='ignore')
        title = re.findall('<title>(.+)</title>', html)
        return title
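# Usage note: get_title() returns the list of regex matches, so for a reachable
# page the title text is get_title(url)[0]; on a failed request it prints the
# error and returns None.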
# Check whether a URL is valid
def isValidURL(url):
    # Regex for a syntactically valid http/https URL
    regex = (r"((http|https)://)(www\.)?"
             r"[a-zA-Z0-9@:%._\+~#?&//=]{2,256}"
             r"\.[a-z]{2,6}\b"
             r"([-a-zA-Z0-9@:%._\+~#?&//=]*)")
    # An empty string is not a valid URL
    if url is None:
        return False
    # Return whether the string matches the regex
    return re.search(regex, url) is not None
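# Quick check (results follow from the regex above):
#   isValidURL('https://www.example.com/a?b=1')  -> True
#   isValidURL('example.com')                    -> False (no scheme)
#   isValidURL(None)                             -> False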
# Collect a page's sub-links
def load(url):
    # Remember pages that have already been crawled
    exist_url.append(url)
    all_urls = []
    # Fetch the page content
    headers = {'Connection': 'close'}
    if isValidURL(url):
        try:
            r = requests.get(url, headers=headers, verify=False, timeout=10)
            if r.status_code == 200:
                data = r.text
                # Find all links with a regex
                link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
                # Deduplicate, resolve relative links, and drop pages already crawled
                for url1 in set(link_list):
                    newUrl = parse.urljoin(url, url1)
                    if isValidURL(newUrl) and newUrl not in exist_url:
                        all_urls.append(newUrl)
        except Exception as e:
            print('Bad link:', e)
    return all_urls
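# Note: load() gathers one level of links only; because exist_url persists
# across calls, calling load() again on one of the returned links will skip
# any page that was already visited.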
url = 'http://www.baidu.cn'
page_1 = load(url)  # Collect the sub-links of the given URL
for i, u1 in enumerate(page_1):
    try:
        s1 = get_title(u1)
        title1 = s1[0]
    except Exception:
        title1 = 'no title found'
    print({"ID": i,
           "time": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
           "title": title1,
           "url": u1})
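The script imports BeautifulSoup but extracts links with a regex, which can miss or mangle unusual markup. As an alternative, BeautifulSoup can read the href attributes directly. Below is a minimal sketch of such a replacement for load(); the name load_bs is illustrative, not part of the original script, and it reuses the isValidURL() check defined above.

import requests
from bs4 import BeautifulSoup
from urllib import parse

def load_bs(url):
    # Same idea as load(), but let BeautifulSoup locate the <a href=...> tags
    r = requests.get(url, headers={'Connection': 'close'}, verify=False, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = []
    for a in soup.find_all('a', href=True):
        newUrl = parse.urljoin(url, a['href'])
        if isValidURL(newUrl) and newUrl not in links:
            links.append(newUrl)
    return links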