Python web scraping basics + Douban + images

Scraping basics: using requests and BeautifulSoup

import requests  # the requests library; install it first (pip install requests)
from bs4 import BeautifulSoup
# Sample HTML document
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# url = "https://www.jqhtml.com/13272.html"
# res = requests.get(url)
# print(res.status_code)

soup = BeautifulSoup(html_doc, 'lxml')  # the lxml parser is faster than the built-in one
# print(soup.prettify())  # prettify() re-indents and completes the markup
# Access a tag, its name, its attributes, and its text content
print(soup.title, soup.title.name,
      soup.a['class'], soup.p.attrs['class'],
      soup.title.string,
      sep='\n', end='\n')

common_use = """Other common methods
find_all(name, attrs, recursive, text, **kwargs) searches the tag's descendants against the filter conditions and returns the matches as a list
find(name, attrs, recursive, string, **kwargs) returns a single element (the first match)
select() takes a CSS selector string: prefix a class name with "." and an id with "#"
"""

print('0', soup.find_all("title"),        # all <title> tags
      '1', soup.find_all("p", "story"),   # <p> tags with class "story"
      '2', soup.find_all("a"),            # all <a> tags
      '3', soup.find_all(id="link2"),     # the tag whose id is "link2"
      '4', soup.find_all('p', class_='title'),
      '5', soup.find_all('a', id='link2'),  # <a> tags with id "link2"
      '6', soup.find_all("a", limit=2),   # 3 tags match, but we only want 2
      sep='\n', end='\n')

print('0', soup.find('title'),
      '1', soup.find('a'),  # returns a single result
      sep='\n', end='\n')

print('0', soup.select('title'),
      '1', soup.select('body a'),   # search layer by layer through nested tags
      '2', soup.select("p > a"),    # direct children of a tag
      '3', soup.select(".sister"),  # search by CSS class name
      '4', soup.select("a#link2"),  # search by tag id
      sep='\n', end='\n')

"""提取标签内容"""
lists = soup.find_all("a",class_='sister') #找到所有含有sister的a标签
i=0
for list in lists:
    print(i)
    print(list)     #打印标签
    print(list.get_text()) #使用get_text()方法获得标签内容,文本信息

    print(list['href'], list['id'], list['class'])  # 获得标签href的内容

    i=i+1

Scraping Douban

import requests
from bs4 import BeautifulSoup

url = "https://book.douban.com/top250?start=0"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
res = requests.get(url, headers=headers)
# print(res.status_code)

"""数据分析,获取"""
soup = BeautifulSoup(res.text, 'lxml')

# Get the book titles - title
titles, rating, data = [], [], []
book_names = soup.find_all('div', class_='pl2')
for book_name in book_names:
    a = book_name.find('a')
    titles.append(a['title'])  # the full book name is in the <a> tag's title attribute
print(titles)

# Get the author info - author
author_info = soup.find_all('p', 'pl')
authors = [author.get_text() for author in author_info]
#print(authors)

# Get the ratings - rate

allstars = soup.find_all('span', class_="rating_nums")
for rate in allstars:
    rating.append(rate.get_text())
# print(rating)

# Get the book blurbs - info
book_info = soup.find_all('span', 'inq')
book_infos = [info.get_text() for info in book_info]
# print(book_infos)

# Combine the fields into one record per book (stored as strings in a list)

for title, author, rate, info in zip(titles, authors, rating, book_infos):
    record = ('Title: ' + str(title) + '\n'
              + 'Author: ' + str(author) + '\n'
              + 'Rating: ' + str(rate) + '\n'
              + 'Blurb: ' + str(info) + '\n')
    data.append(record)

# Output file name
filename = 'douban_book_top250.txt'
# Save to disk
with open(filename, 'w', encoding='utf-8') as f:
    # Write one record per book
    f.writelines(data)
print('Saved successfully')
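
The script above only covers the first page (start=0). Below is a minimal sketch of walking the whole list, under the assumption, suggested by the URL pattern above, that Douban pages the Top 250 in steps of 25 via the start query parameter:

import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}

all_titles = []
for start in range(0, 250, 25):  # assumption: 25 books per page
    url = f"https://book.douban.com/top250?start={start}"
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    for div in soup.find_all('div', class_='pl2'):
        all_titles.append(div.find('a')['title'])
    time.sleep(1)  # pause between requests to avoid hammering the server
print(len(all_titles))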

Images

Some websites are dynamic: in the browser's inspector you can see the rendered tags and code, but the site protects that content from scraping, so it is missing from the raw HTML response. In that case we need packet capture (for example the browser's Network tab) to find the requests that return the complete data, as in the sketch below.
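
A minimal sketch of what to do once packet capture has revealed the real data request: call that endpoint directly and save the image bytes it points to. The API URL and the JSON field name below are hypothetical placeholders, not a real interface:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}

# Hypothetical endpoint found in the browser's Network tab (packet capture).
api_url = "https://example.com/api/photos?page=1"
res = requests.get(api_url, headers=headers)
items = res.json()  # assumption: the endpoint returns a JSON list

for i, item in enumerate(items):
    img_url = item['url']  # hypothetical field holding the image address
    img = requests.get(img_url, headers=headers)
    # Image data is binary, so write response.content in 'wb' mode.
    with open(f'image_{i}.jpg', 'wb') as f:
        f.write(img.content)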
