Web Scraping Notes (Part 2)

Data Parsing

        Categories of data parsing:   1. regular expressions   2. bs4   3. xpath

        Overview of how data parsing works:

                - The text content to be parsed is stored either between tags or in the attributes of the corresponding tags

                - Locate the specified tags

                - Extract (parse) the data values stored in the tags or in their attributes (see the minimal sketch after this list)
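
Whichever of the three approaches is used, the workflow is always "locate the tag, then pull out its text or attribute value". Below is a minimal bs4 sketch on a hand-written HTML fragment (the fragment and variable names are made up purely for illustration):

from bs4 import BeautifulSoup

#a tiny made-up fragment: the data lives in the tag text and in the href attribute
html = '<div class="item"><a href="https://example.com/ch1">Chapter 1</a></div>'
soup = BeautifulSoup(html, 'lxml')

a_tag = soup.select_one('div.item > a')   #step 2: locate the specified tag
print(a_tag.text)                         #step 3: extract the text stored between the tags
print(a_tag['href'])                      #step 3: extract the value stored in the tag's attribute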

Hands-on Practice

        Requirement: request a URL and save the image to local disk

import requests


url = 'https://pics5.baidu.com/feed/b3b7d0a20cf431ad67f62bea911223aa2fdd98e3.jpeg'
#.content returns the response body as raw bytes, i.e. the image data in binary form
image_data = requests.get(url=url).content
#write the binary data to a local file ('wb' = write binary)
with open("./zz.jpeg",'wb') as fp:
    fp.write(image_data)
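
For larger images it can help to verify the response status and stream the download instead of holding everything in memory at once. A minimal sketch of that variant (the chunk size and output file name are arbitrary choices, not from the original code):

import requests

url = 'https://pics5.baidu.com/feed/b3b7d0a20cf431ad67f62bea911223aa2fdd98e3.jpeg'
response = requests.get(url=url, stream=True)
response.raise_for_status()   #fail loudly on 4xx/5xx instead of writing an error page to disk
with open('./zz_streamed.jpeg', 'wb') as fp:
    for chunk in response.iter_content(chunk_size=8192):   #write the image piece by piece
        fp.write(chunk)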

        Requirement: scrape the images from the specified URL. Replace the Cookie below with your own, otherwise the scrape will fail.

import requests
import re
import os

#create a folder to save all the images
if not os.path.exists('./image'):
    os.mkdir('./image')
#request headers
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
           ,'Cookie':'BAIDUID=15792F369CCB57C7B1768AB47021693D:FG=1; BIDUPSID=C5F74397E4C3533B468A6D7F70753C45; PSTM=1709637600; H_PS_PSSID=40212_40080_40365_40352_40375_40366_40401_40464_40459_40317_39661_40510; H_WISE_SIDS=40212_40080_40365_40352_40375_40366_40401_40464_40459_40317_39661_40510; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ab_sr=1.0.1_YTc4OTcyYWQ4NzdkMjkxMjAyZjc1NzM1YmY3MDdlOWM0YjM3ZTUzOGVlNWVjMjIzZjEwN2Q5NGExOWM5YjAxMDA1ZWFmYmJiNGNkYzQ4YTI4OTFiNjU0Y2E4OWE3ZDBlODFmZjg4M2JjZjFhOTA5YjAwNTFmNGVhMmM3NWViMWViZWEzZGIzNzVhNDEyZWIwOTAzMGYwYTg4NjRjOTlhMg==; BA_HECTOR=20a1858l808485242l252424f05h9o1ivqpa91s; ZFY=intePhgjae:BoGLsKPKIab2U4XLe4NjsR:B15bTm:BBmQE:C; RT="z=1&dm=baidu.com&si=d57b3fea-11cb-4cc7-9cb6-664a568860fe&ss=lu2mdpll&sl=1&tt=1ec&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=26p&ul=2pyf&hd=2pz6'
            ,'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'

            ,'Referer':'https://www.baidu.com/link?url=glpn33iXU_5YSodvA3laXaAJ9gntlMpxcDahjF8rt-4UdB3RPqqEbDSVm4Y4uEuXcKazstR2DjF_mH5ILBv39sBvPwHzv2yVrUGRNXyAkqa&wd=&eqid=e4289b1d0029ac530000000665fd39e3'
           ,'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
           }
#the page to scrape
url = 'https://baijiahao.baidu.com/s?id=1646606332727653670&wfr=spider&for=pc'

#fetch the whole page for the URL with a general-purpose crawl, then strip spaces and newlines
url_text = requests.get(url=url,headers=headers).text.replace(' ','').replace('\n','')
#regular expression that captures the image link (raw string so \? is not treated as a string escape)
ex = r'{"type":"img","link":"(.*?)@f_auto\?token=.*?"}'
img_url = re.findall(ex,url_text)
url_list = []
print(img_url)
#deduplicate
for item in img_url:
    if item not in url_list:
        url_list.append(item)
#drop the entries that cannot be used (the last 10 matches on this page)
del url_list[-10:]

for item in url_list:
    #derive the image file name from the URL
    img_name = item.split('/')[-1]
    #build the save path
    img_path = './image/' + img_name
    img_data = requests.get(url=item,headers=headers).content
    with open(img_path,'wb') as fp:
        fp.write(img_data)
        print(img_name + ' downloaded successfully!')
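
A common alternative to stripping spaces and newlines up front is to pass the re.S (DOTALL) flag so that .*? can also match across line breaks; this assumes the JSON keys in the page source appear without extra spaces, which may not hold for every page. A minimal sketch reusing the url and headers defined above:

#the same extraction without pre-stripping the page text
page_text = requests.get(url=url, headers=headers).text
img_url = re.findall(r'{"type":"img","link":"(.*?)@f_auto\?token=.*?"}', page_text, re.S)
print(img_url)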

How bs4 data parsing works:

        1. Instantiate a BeautifulSoup object and load the page source data into it

        2. Call the BeautifulSoup object's attributes and methods to locate tags and extract data

        Basic usage of bs4

from bs4 import BeautifulSoup
import lxml   #lxml only needs to be installed as the parser backend; importing it is not strictly required
#load the data of a local html file into the object
#fp = open('./test.html','r',encoding='utf-8')
#soup = BeautifulSoup(fp,'lxml')
#print(soup)
#load page source fetched from the internet into the object
#page_text = response.text
#soup = BeautifulSoup(page_text,'lxml')

fp = open('./test.html','r',encoding='utf-8')
soup = BeautifulSoup(fp,'lxml')
#soup.tagName returns the first occurrence of the tagName tag in the html
print(soup.a)
#print(soup.li)
#find('tagName') is equivalent to soup.tagName
print(soup.find('a'))
#find_all returns all tags that match
print(soup.find_all('a'))
#select('some selector (id, class, tag, ...)') returns a list
print(soup.select('.more'))
# '>' means one level down, ' ' (a space) means any number of levels down
print(soup.select('.s1 > dd > ul > li > a')[0])
print(soup.select('.s1 > dd  a')[0])

#get the text data inside a tag
#text/get_text() return all the text inside a tag; string only returns the text that is a direct child of the tag
print(soup.select('.s1 > dd  a')[0].text)
#print(soup.select('.s1 > dd  a')[0].string)
#print(soup.select('.s1 > dd  a')[0].get_text())

#get an attribute of a tag
print(soup.select('.s1 > dd  a')[0]['href'])
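
find()/find_all() can also filter by attribute, which the snippet above does not show. A minimal, self-contained sketch (the HTML fragment is made up for illustration):

from bs4 import BeautifulSoup

html = '<ul><li class="hot"><a href="/a">first</a></li><li><a href="/b">second</a></li></ul>'
soup = BeautifulSoup(html, 'lxml')
#filter by class: class_ avoids clashing with the Python keyword 'class'
print(soup.find_all('li', class_='hot'))
#the same filter expressed through the attrs dict
print(soup.find_all('li', attrs={'class': 'hot'}))
#select_one returns the first match of a CSS selector (or None)
print(soup.select_one('li.hot > a')['href'])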

Requirement: scrape all chapter titles and contents of Romance of the Three Kingdoms from the specified book site

from bs4 import BeautifulSoup
import lxml
import requests


url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'

}
page_text = requests.get(url=url,headers=headers).content

#instantiate a BeautifulSoup object and load the page source data into it
soup = BeautifulSoup(page_text,'lxml')
#parse the chapter titles and the detail-page urls
li_list = soup.select('.book-mulu > ul > li ')
title_list = []
content_list = []
#write into a text file
fp = open('./sanguo.txt','w',encoding='utf-8')
for li in li_list:
    url_s = 'https://www.shicimingju.com' + li.a['href']
    detail_page_text = requests.get(url=url_s,headers=headers).content
    detail_soup = BeautifulSoup(detail_page_text,'lxml')
    content = detail_soup.select('.chapter_content ')[0].get_text()
    title = li.a.string
    title_list.append(title)
    fp.write(title +  ':' + content + '\n')
    print(title + ' scraped successfully!')
fp.close()
print(title_list)
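
As a small refinement, the output file can be managed with a context manager so it is always closed even if a request fails, and a short pause between chapter requests keeps the load on the site low. A minimal sketch of the same loop (the 0.5-second delay is an arbitrary choice; li_list and headers are the variables defined above):

import time

with open('./sanguo.txt','w',encoding='utf-8') as fp:
    for li in li_list:
        url_s = 'https://www.shicimingju.com' + li.a['href']
        detail_soup = BeautifulSoup(requests.get(url=url_s,headers=headers).content,'lxml')
        content = detail_soup.select('.chapter_content')[0].get_text()
        fp.write(li.a.string + ':' + content + '\n')
        time.sleep(0.5)   #be polite to the server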

Requirement: scrape the titles and prices of second-hand houses on 58.com

#requirement: scrape the listing information for second-hand houses on 58.com
from lxml import etree
import requests

url = 'https://bj.58.com/ershoufang/'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
    ,
}
page_text = requests.get(url=url,headers=headers).text

#data parsing
tree = etree.HTML(page_text)
fp = open('./ershoufang.txt','w',encoding='utf-8')
title_list = tree.xpath('//div[@class="property-content-title"]//h3[@class="property-content-title-name"]/text()')
price_list = tree.xpath('//div[@class="property-price"]//span[@class="property-price-total-num"]/text()')

print(title_list)
print(price_list)
for i in range(len(title_list)):
    fp.write(title_list[i] + '- price: '+ price_list[i] + '\n')
fp.close()
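
If the two XPath queries return lists of different lengths (for example when a listing shows no price), indexing by position can misalign the data or raise IndexError; pairing with zip() stops at the shorter list instead. A minimal sketch reusing title_list and price_list from above (the output file name is an arbitrary choice):

with open('./ershoufang_zip.txt','w',encoding='utf-8') as fp:
    #zip() pairs each title with its price and stops at the shorter list
    for title, price in zip(title_list, price_list):
        fp.write(title.strip() + ' - price: ' + price.strip() + '\n')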

Requirement: scrape all the anime images from the specified site, covering pages 1 to 136

import requests
from lxml import etree
import os

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
    ,
}
#page 1 uses index.html (later pages use index_<n>.html)
url = 'https://pic.netbian.com/4kdongman/index.html'
response = requests.get(url=url,headers=headers)
#the site is GBK-encoded; set the encoding explicitly so the text decodes correctly
response.encoding = 'gbk'
page_text = response.text
tree = etree.HTML(page_text)
#relative src paths of all thumbnails on the page
url_list = tree.xpath('//div[@class="slist"]/ul[@class="clearfix"]//img/@src')
print(url_list)
if not os.path.exists('./img'):
    os.mkdir('./img')
for i in range(len(url_list)):
    url_path = 'https://pic.netbian.com' + url_list[i]
    img_name = ('./img/' + url_list[i].split('/')[-1])
    imgdata = requests.get(url=url_path,headers=headers).content
    with open(img_name,'wb') as fp:
        fp.write(imgdata)
    print(img_name + " download success\n")
#pages 2 to 136 follow the pattern index_<n>.html
for page in range(2,137):
    url = 'https://pic.netbian.com/4kdongman/index_' + str(page) + '.html'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'
    page_text = response.text
    tree = etree.HTML(page_text)
    url_list = tree.xpath('//div[@class="slist"]/ul[@class="clearfix"]//img/@src')
    for i in range(len(url_list)):
        url_path = 'https://pic.netbian.com' + url_list[i]
        img_name = './img/' + url_list[i].split('/')[-1]
        imgdata = requests.get(url=url_path, headers=headers).content
        with open(img_name, 'wb') as fp:
            fp.write(imgdata)
        print(img_name + " download success\n")
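
The page-1 block and the pages-2-to-136 loop above are nearly identical, so the whole job can also be written as one loop over a small helper that builds the right URL for each page. A minimal sketch of that refactor (the helper name and structure are my own choice, not from the original; headers is the dict defined above):

import os
import requests
from lxml import etree

def page_url(n):
    #page 1 has no numeric suffix; later pages are index_<n>.html
    if n == 1:
        return 'https://pic.netbian.com/4kdongman/index.html'
    return 'https://pic.netbian.com/4kdongman/index_' + str(n) + '.html'

os.makedirs('./img', exist_ok=True)
for n in range(1, 137):
    response = requests.get(url=page_url(n), headers=headers)
    response.encoding = 'gbk'
    tree = etree.HTML(response.text)
    for src in tree.xpath('//div[@class="slist"]/ul[@class="clearfix"]//img/@src'):
        img_path = './img/' + src.split('/')[-1]
        with open(img_path, 'wb') as fp:
            fp.write(requests.get(url='https://pic.netbian.com' + src, headers=headers).content)
        print(img_path + ' download success')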

Requirement: scrape the names of all cities nationwide
 

import requests
from lxml import etree

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
    ,
}
url = 'https://www.aqistudy.cn/historydata/'
response = requests.get(url=url,headers=headers)
page_text = response.text
tree = etree.HTML(page_text)
#all city names on the page sit inside <ul class="unstyled"> lists
city_list = tree.xpath('//ul[@class="unstyled"]//a/text()')
#deduplicate while preserving the original order
unique_city_list = []
for city in city_list:
    if city not in unique_city_list:
        unique_city_list.append(city)
print(city_list)
print(len(unique_city_list))
fp = open('./city.txt','w',encoding='utf-8')
for city in unique_city_list:
    fp.write(city + '\n')
fp.close()
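
The order-preserving deduplication above can also be written in one line with dict.fromkeys(), since dict keys are unique and keep insertion order in Python 3.7+. A minimal sketch reusing city_list from above:

#dict keys are unique and keep insertion order, so this deduplicates in one pass
unique_city_list = list(dict.fromkeys(city_list))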
