# ==== 使用 urllib 抓取首页信息 (fetch a homepage with urllib) ====
# 抓取百度首页信息 (fetch the Baidu homepage):
# 1. Import the package (think of it as opening a browser)
from urllib import request
# 2. Prepare the URL
url = 'http://www.baidu.com'
# 3. Open the URL. Use a context manager so the HTTP response is
#    always closed, even if reading or decoding raises — the
#    original left the connection open.
with request.urlopen(url) as response:
    # 4. Read the raw bytes, decode them as UTF-8, and print the page
    print( response.read().decode('utf-8') )
# ==== 正则表达式 (regular expressions) ====
# @File : 04.正则表达式.py
# Regular-expression practice: compiling patterns, search/findall,
# and greedy vs. non-greedy matching.
import re

str2 = 'abcdefg'
# Plain string operations are enough for simple membership checks:
# print('h' in str2)
# print( str2.count('h') )

# Build a pattern object: re.compile(pattern_str [, flags]).
# All patterns below use raw strings (r'...') so backslashes are not
# treated as string escapes — '\w' in a normal string raises a
# SyntaxWarning on modern Python.
reg = re.compile(r'[0-9]')
# 'abcdefg' contains no digit, so search returns (and prints) None
print( re.search(reg, str2) )

names = '张三丰李四王五赵六张二张杰张飞'
# 张 followed by exactly one word character
reg1 = re.compile(r'张\w')
# re.findall(pattern, text) returns a list of the matched substrings
print(re.findall(reg1, names))

scores = '''
[
['学号', '姓名', '语文', '数学', '英语'],
['1号', '张三: 50,70,60'],
['2号', '李四: 42,74,86'],
['3号', '王五: 50,84,90'],
['4号', '赵六: 20,73,40'],
['5号', '孙七: 70,60,45'],
['6号', '钱八: 80,40,90'],
]
'''
# num = re.compile(r'\d+')
# [\u4e00-\u9fa5] is the CJK unified-ideographs range (Chinese chars).
# NOTE(review): this pattern needs at least 3 consecutive Chinese
# characters (2-3 captured plus 1 more), but every Chinese run in
# `scores` is only 2 characters long, so findall returns [] here —
# confirm what the intended pattern was.
num = re.compile(r'([\u4e00-\u9fa5]{2,3})[\u4e00-\u9fa5]')
result = re.findall(num, scores)
print(result)

# Greedy mode: .* consumes as much as possible (up to the LAST 'b')
str3 = 'a123b 4566b 7八9b'
reg3 = re.compile(r'a.*b')
print( re.findall(reg3, str3) )
# Non-greedy mode: .*? stops at the FIRST 'b'
reg4 = re.compile(r'a.*?b')
print( re.findall(reg4, str3) )
# ==== 使用正则提取 51job 职位信息 (extract 51job listings with regex) ====
# Scrape 51job search results (query: 数据分析 / data analysis) and
# save the extracted rows into an Excel workbook.

# 1. Import the package
from urllib import request
# 2. Prepare the URL (the search query is already percent-encoded)
url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
# 3/4. Open the URL, read and decode the page. The site serves
#      GBK-encoded HTML. A context manager closes the response even
#      on error — the original never closed it.
with request.urlopen(url) as response:
    html = response.read().decode('gbk')
# 5. Scraping tip: find the repeated HTML structure on the page and
#    turn a copy of it into a regular expression:
#    - wrap the parts to KEEP in (.*?)
#    - replace the parts to SKIP with .*?
#    - pass re.S so '.' also matches newlines inside the markup
import re
reg = re.compile('<div class="el">.*?<p class="t1 ">.*?<a target="_blank" title="(.*?)".*?<span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?</div>',re.S)
# Each match is a 5-tuple: (job title, company, location, salary, date)
result = re.findall(reg,html)
from openpyxl import Workbook
wb = Workbook()
sheet = wb.active
sheet.title = '职位信息'
# Header row first, in the same column order as the regex groups
sheet.append('职位名,公司名,工作地点,薪资,发布时间'.split(','))
for each in result:
    sheet.append(each)
wb.save('数据分析职位信息.xlsx')
# ==== requests 获取网页内容 (fetch page content with requests) ====
# Fetch a search-results page with requests, detect its encoding,
# collect every image URL with BeautifulSoup, then download each image.
# (If the imports below are underlined in red: pip install requests)

# 1. Import the HTTP client
import requests
# 2. The page we want to scrape
url = 'http://pic.netbian.com/e/search/result/?searchid=2252'
# 3. Request the page, sending a browser User-Agent header so the
#    site does not reject the script outright.
#    response.content -> raw bytes, response.text -> decoded str
response = requests.get(url,
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
)
# 4. Detect the page encoding from the raw bytes and apply it so
#    response.text decodes correctly.
import chardet
# chardet.detect(raw_bytes) returns a dict with an 'encoding' key
response.encoding = chardet.detect(response.content)['encoding']
html = response.text

# Parse the HTML and grab all <img> tags inside the #main element
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'html.parser')
# Anchor on a parent with a distinctive id, then narrow down
parent = soup.find(id='main')
imgs = parent.find('ul').find_all('img')

# The src attributes are site-relative, so prefix the host to build
# full, downloadable URLs.
srcList = ['http://pic.netbian.com' + tag.attrs['src'] for tag in imgs]
print(srcList)

'''
The download helper is:
from urllib import request
request.urlretrieve(url, 'local filename')  # downloads one file
'''
from urllib import request
# Walk the URL list and save each image as 1.jpg, 2.jpg, ...
for index,each_src in enumerate(srcList):
    request.urlretrieve(each_src ,f'{index+1}.jpg')