Python:引入 bs4、xlwt、re、sqlite3 四个模块。其中,xlwt 用来实现一个九九乘法表的例子。这四个模块配合使用,可以实现简单的网页爬虫:将爬取到的有用信息写入 Excel 表中,或者存入数据库中。
一、首先,引入美味汤汁:from bs4 import BeautifulSoup
# coding=utf-8
# @author: yyh
# time on: 2020/10/27 20:54
from bs4 import BeautifulSoup
# Read the saved page as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read is done
# (the previous bare open() never closed it).
with open('./baidu.html', 'rb') as file:
    html = file.read().decode('utf-8')
# Parse the text with the 'html.parser' backend to obtain the document object.
bs = BeautifulSoup(html, 'html.parser')
# print(bs.title)
# print(bs.head)
# print(bs.a)
# #1. Tag 标签
# print(bs.title.string)
# print(type(bs.title.string))
#
# #2.NavigableString 标签里的内容(字符串)
# print(bs.a.attrs)
#
# #3.BeautifulSoup表示整个文档
# print(bs)
#
# #4.Comment 是一个特殊的NavigableString,输出的内容不包含注释符号
########文档的搜索
#-------1.find_all--------
# a_list = bs.find_all("a")
# print(a_list)
# import re #引入正则表达式模块
# #正则表达式搜索:使用search()方法来匹配内容
# a_list = bs.find_all(re.compile("a"))
# for item in a_list:
# print(item)
#方法:返回含有"name"属性的标签 (可用于自定义)
# def name_is_exists(tag):
# return tag.has_attr("name")
#
# name_list = bs.find_all(name_is_exists)
# print(name_list)
#--------2.kwargs 参数
#例:打印id为head 的元素内容
# head_list = bs.find_all(id="head")
# for item in head_list:
# print(item)
# head_list = bs.find_all(class_="news-meta-item clearfix")
# #例:打印class=news-meta-item clearfix 的元素内容,class_,下划线不能掉
# for item in head_list:
# print(item)
#--------3.text 参数
# text_list = bs.find_all(text='hpyouyu') #一个内容
# print(text_list)
# text_list = bs.find_all(text=['hpyouyu','地图','新闻']) #多个内容
# print(text_list)
#--------4.limit 参数
# a_list = bs.find_all("a",limit=3)
# for item in a_list:
# print(item)
#--------5.css选择器
#t_list = bs.select('title') #通过标签来查找
#t_list = bs.select('.mnav') #通过类名来查找
#t_list = bs.select('#u1') #通过id来查找
#t_list = bs.select('a[class="mnav c-font-normal c-color-t"]') # 通过属性来查找
# CSS selector with the child combinator: pick every <meta> element that is
# a direct child of <head>, then print each match.
t_list = bs.select('head > meta')
for item in t_list:
    print(item)
二、其次,引入xlwt:import xlwt
用于写数据到excel表中
下面用一个九九乘法表的例子来帮助理解
# Write the 9x9 multiplication table into an Excel sheet.
import xlwt  # the snippet used xlwt without importing it in code

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('九九乘法表')
for i in range(1, 10):
    # Row i holds the products j*i for j = 1..i, so the table is lower-triangular.
    for j in range(1, i + 1):
        # The loop counters start at 1; i-1 / j-1 shift them to the
        # row/column indices passed to write().
        worksheet.write(i - 1, j - 1, '%d*%d=%d' % (j, i, j * i))
workbook.save('chengfabiao.xls')
运行结果:
三、最后,正则表达式的应用,引入re:import re
# Regular-expression basics
import re

# ---------- 1. search()
# With a precompiled pattern object.
# Raw string: '\d' in a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
pat = re.compile(r'\d{3}')  # the regular expression
m = pat.search('123')  # the string being checked
print(m)
# Without a pattern object
m = re.search('[A-Z]', 'ADadsBCKj')  # pattern first, then the string being checked
print(m)
# ---------- 2. findall()
print(re.findall('[A-Z]', 'ADadsBCKj'))
print(re.findall('[A-Z]+', 'ADadsBCKj'))
# ---------- 3. sub()
print(re.sub('a', 'A', 'aaabbbaC'))  # replace every 'a' in 'aaabbbaC' with 'A'
# Tip: prefix regex strings with r so backslashes are not processed as
# string escapes. The two literals below show the difference.
a = "\aab-\""
b = r"\aab-\""
print(a)
print(b)
四、补充sqlite3,import sqlite3
# coding=utf-8
# @author: yyh
# time on: 2020/10/28 23:37
import sqlite3
# 1.连接数据库
# conn = sqlite3.connect('test.db')#打开或创建数据库文件
# print('opened database successfully')
# 2.建表
# c = conn.cursor() #获取游标
# sql = '''
# create table employee
# (id int primary key not null,
# name text not null,
# age int not null,
# address char(50),
# salary real);
# '''
# c.execute(sql) #执行sql语句
# conn.commit() #提交数据库操作
# conn.close() #关闭数据库连接
# print('create table successfully')
# 3.插入数据
# conn = sqlite3.connect('test.db')
# c = conn.cursor()
# sql1= '''
# insert into employee(id,name,age,address,salary)
# values (1,'jack',20,'北京',10000)
# '''
#
# sql2 = '''
# insert into employee(id,name,age,address,salary)
# values (2,'Amy',21,'福建',8000)
# '''
# c.execute(sql1)
# c.execute(sql2)
# conn.commit()
# conn.close()
# print('insert successfully')
# 4.查询数据
# Query every row of the employee table and print it field by field.
conn = sqlite3.connect('test.db')
try:
    c = conn.cursor()
    sql = "select * from employee"
    cursor = c.execute(sql)
    for row in cursor:
        # Columns are read by position, in the order given by the
        # commented-out CREATE TABLE statement earlier in this file.
        print('id=', row[0])
        print('name=', row[1])
        print('age=', row[2])
        print('address=', row[3])
        print('salary=', row[4])
        # Blank line between records.
        # NOTE(review): original indentation was lost; assumed per-row - confirm.
        print()
finally:
    # Release the connection even if the query raises (it was never closed before).
    conn.close()