# -*- coding: UTF-8 -*-
# @Time: 2021/6/3 10:19
# @Author: wuling
# @Version: V1.0
# @File: mytest.py
# @desc: 练习爬虫
from bs4 import BeautifulSoup # 网页解析,获取数据
import re # 正则表达式,进行文字匹配
import urllib.request, urllib.error # 制定URL,获取网页数据
import xlwt # 进行excel操作
#import sqlite3 # 进行SQLite数据库操作
# Matches the opening of an <a> tag that points at a Python tutorial chapter
# ("/python/python...") and whose title attribute starts with "Python ".
# The pattern has no capture groups, so a match yields the whole matched text.
findLink = re.compile(r'<a href="/python/python.*?" target="_top" title="Python .*?".*?')

# Tutorial index page whose chapter links are listed.
url = "https://www.runoob.com/python/python-tutorial.html"

# Browser-like User-Agent header sent with the request.
head = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/77.0.3865.90 Mobile Safari/537.36"
    )
}

# Output line for one lesson: (zero-based position, quoted title).
LESSON_TEMPLATE = "***************这是第%s课,课程叫%s****************"


def fetch_html(page_url, headers, timeout=10):
    """Download *page_url* and return its body decoded as UTF-8.

    Args:
        page_url: Address of the page to fetch.
        headers: Mapping of HTTP request headers to send.
        timeout: Seconds to wait on the socket before giving up.

    Returns:
        The page source as a string.

    Raises:
        urllib.error.URLError: If the request fails (includes HTTPError).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    request = urllib.request.Request(page_url, headers=headers)
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')


def extract_title(tag_html):
    """Return the quoted title of a tutorial link, or None if it is not one.

    Args:
        tag_html: The HTML text of a single <a> element.

    Returns:
        The title attribute value including its surrounding double quotes
        (e.g. '"Python Intro"'), or None when *tag_html* does not match
        ``findLink``.
    """
    match = findLink.search(tag_html)
    if match is None:
        return None
    # Cut at the attribute name instead of at every '=' so that a title
    # which itself contains '=' is returned whole.
    return match.group(0).partition(' title=')[2]


def collect_titles(tag_htmls):
    """Return the titles of all tutorial links found in *tag_htmls*, in order.

    Args:
        tag_htmls: Iterable of HTML strings, one per <a> element.

    Returns:
        A list of quoted titles; elements that are not tutorial links are
        skipped.
    """
    candidates = (extract_title(tag_html) for tag_html in tag_htmls)
    return [title for title in candidates if title is not None]


def format_lessons(titles):
    """Return one display line per title, numbered by position from 0.

    Numbering uses the position in the list, so repeated titles still get
    distinct lesson numbers.
    """
    return [LESSON_TEMPLATE % (index, title) for index, title in enumerate(titles)]


def main():
    """Fetch the tutorial index page and print every chapter title."""
    try:
        html = fetch_html(url, head)
    except OSError as err:
        # URLError, HTTPError and socket timeouts are all OSError subclasses.
        raise SystemExit("Failed to fetch %s: %s" % (url, err))

    soup1 = BeautifulSoup(html, 'html.parser')
    data = collect_titles(str(tag) for tag in soup1.find_all("a"))
    for line in format_lessons(data):
        print(line)


if __name__ == "__main__":
    main()
爬出来的结果数据见下图: