python爬虫实战(一)～爬取百度百科人物的文本+图片信息+Restful api接口

天狼啸月1990

已于 2024-06-06 10:10:39 修改

阅读量825

点赞数

分类专栏：爬虫 # python programming 文章标签：爬虫 python编程

于 2021-05-30 08:41:00 首次发布

原文链接：https://www.bilibili.com/video/BV1Kx411Q7gE?from=search&seid=11300633336131170833

版权

python programming 同时被 2 个专栏收录

12 篇文章 1 订阅

订阅专栏

爬虫

5 篇文章 1 订阅

订阅专栏

我的github地址：GitHub - yuyongsheng1990/python_spider_from_bdbaike

# -*- coding: UTF-8 -*-
# @Project -> File: python_spider_from_bdbaike -> spider_baike_text_picture
# @Time: 2021/6/3 20:13 
# @Description: 从百度百科爬取人物的基本信息、信息框数据和图片
import os
from urllib.request import urlretrieve
import urllib.parse
from urllib.error import HTTPError

import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
import xlwt
import xlrd
from xlutils.copy import copy

# 防止ssl报错
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# 爬虫程序
def claw(content):
    """Scrape the Baidu Baike entry for ``content``.

    The page is downloaded once with a browser-like User-Agent, parsed with
    BeautifulSoup/lxml, and the same parsed tree is used for the summary,
    the infobox and the image lookup.

    Args:
        content: The entry title to look up (e.g. a person's name). It is
            URL-quoted before being appended to the Baike item URL.

    Returns:
        A 3-tuple ``(intro, profile_info, img_urllist)``:

        * ``intro`` -- text of all ``div.lemma-summary`` elements joined
          into one string with every whitespace character removed.
        * ``profile_info`` -- dict mapping infobox field names (``dt``
          elements, whitespace removed) to their values (``dd`` elements,
          only leading/trailing spaces stripped, line breaks kept).
        * ``img_urllist`` -- list of ``<img src>`` URLs that match the
          image pattern below.

    Errors raised by ``urllib.request.urlopen`` or by decoding the response
    as UTF-8 are not handled here and propagate to the caller.
    """
    # Build the request: entry URL plus a forged browser User-Agent so the
    # site does not reject the crawler outright.
    url = 'https://baike.baidu.com/item/' + urllib.parse.quote(content)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers, method='GET')
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        text = response.read().decode('utf-8')

    # ------------------------------------------------------------------
    # Parse the HTML once.
    soup = BeautifulSoup(text, 'lxml')

    intro_tags = soup.find_all('div', class_="lemma-summary")
    name_tags = soup.find_all('dt', class_="basicInfo-item name")
    value_tags = soup.find_all('dd', class_="basicInfo-item value")

    # Summary: split()/join drops every whitespace run (newlines and
    # non-breaking spaces included), then the pieces are concatenated.
    intro_after_filter = ''.join(
        ''.join(tag.get_text().split()) for tag in intro_tags
    )

    # Infobox: field names lose all whitespace; values only lose surrounding
    # spaces because line breaks inside a value are intentionally preserved.
    names = [''.join(tag.get_text().split()) for tag in name_tags]
    values = [tag.get_text().strip(' ') for tag in value_tags]
    # zip() pairs the i-th <dt> with the i-th <dd>; a repeated field name
    # keeps the last value seen.
    profile_info = dict(zip(names, values))

    # Images: keep only <img> tags nested as a > div > img whose src is an
    # https URL containing "image" and ending in "auto".
    img_urllist = []
    for img in soup.select('a>div>img'):
        src = img.get('src')
        # Tags without a usable src attribute are skipped explicitly rather
        # than through a catch-all exception handler.
        if isinstance(src, str) and re.match(r'https:(.*)image(.*)auto$', src):
            img_urllist.append(src)

    return intro_after_filter, profile_info, img_urllist

# 下载爬到的数据：基本信息、信息框、图片
def download(name, intro, profile_dict, img_list):
    """Persist the scraped data for one entry under the current directory.

    Three outputs are produced relative to ``os.getcwd()``:

    * ``introduction/<name>.txt`` -- the summary text, written as UTF-8
      (overwritten if it already exists).
    * ``profile/profile.csv`` -- one row appended per call with the infobox
      values for the columns in ``field_list``.
    * ``img/<name>/<name>_<n>.jpg`` -- at most ``download_limit`` images.

    Args:
        name: Entry title; used for file and directory names.
        intro: Summary text to write.
        profile_dict: Mapping of infobox field name to value.
        img_list: Iterable of image URLs to download.

    Image downloads are best-effort: a failed request is reported with
    ``print`` and skipped, and responses smaller than 1000 bytes are
    discarded without creating a file.
    """
    project_path = os.getcwd()

    # ---- summary text -------------------------------------------------
    intro_dir = os.path.join(project_path, 'introduction')
    os.makedirs(intro_dir, exist_ok=True)
    introduction_file = os.path.join(intro_dir, name + '.txt')
    # 'w' both creates and overwrites; an explicit encoding keeps Chinese
    # text writable regardless of the platform's default locale encoding.
    with open(introduction_file, 'w', encoding='utf-8') as f:
        f.write(intro)

    # ---- infobox row --------------------------------------------------
    profile_dir = os.path.join(project_path, 'profile')
    os.makedirs(profile_dir, exist_ok=True)

    # NOTE: despite the .csv extension this file is a workbook written by
    # xlwt and read back by xlrd; the name is kept so existing files are
    # still found.
    profile_file = os.path.join(profile_dir, 'profile.csv')
    field_list = ['中文名', '外文名', '别名', '性别', '学位', '职称', '国籍', '民族', '出生地', '籍贯', '出生日期', '逝世日期',
                  '星座', '血型', '身高','体重', '毕业院校', '职业', '经纪公司', '代表作品', '主要成就', '生肖', '语种', '特长', '粉丝名']
    if not os.path.exists(profile_file):
        # First run: create the workbook with a header row.
        workbook = xlwt.Workbook(encoding='utf-8')
        header_sheet = workbook.add_sheet('profile_sheet', cell_overwrite_ok=True)
        for col, field in enumerate(field_list):
            header_sheet.write(0, col, field)
        workbook.save(profile_file)

    # xlrd workbooks are read-only, so copy into a writable xlwt workbook,
    # append after the last used row, and save over the old file.
    rb = xlrd.open_workbook(profile_file)
    rows_num = rb.sheet_by_name('profile_sheet').nrows
    wb = copy(rb)
    output_sheet = wb.get_sheet(0)
    for col, field in enumerate(field_list):
        value = profile_dict.get(field)
        if value:
            output_sheet.write(rows_num, col, value)
    # save() overwrites in place; the old file is not deleted first so that
    # a failed save cannot leave the directory without any workbook.
    wb.save(profile_file)

    # ---- images -------------------------------------------------------
    # Forged browser User-Agent so the image host does not reject the crawler.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    download_limit = 10  # maximum number of images saved per entry
    name_path = os.path.join(project_path, 'img', name)
    os.makedirs(name_path, exist_ok=True)

    count = 1  # 1-based index of the next image file to write
    for img_url in img_list:
        try:
            # The context manager releases the connection on every path.
            with requests.get(img_url, headers=headers, timeout=10) as response:
                response.raise_for_status()  # do not save HTTP error pages
                content = response.content
        except requests.RequestException as e:
            print('failed to download %s: %s' % (img_url, e))
            continue

        # Very small payloads are treated as low-quality images; checking
        # before opening the file avoids leaving empty .jpg files behind.
        if len(content) < 1000:
            continue

        filename = os.path.join(name_path, name + '_%s.jpg' % count)
        with open(filename, "wb") as f:
            f.write(content)
        count += 1
        if count > download_limit:
            break

if __name__ == '__main__':
    # Single-shot run: scrape one hard-coded entry and store its data.
    # Swap the literal for an input() prompt to query interactively.
    name = '潘建伟'
    intro, profile_dict, img_list = claw(name)
    download(name, intro, profile_dict, img_list)

2. 人物履历等数据按json格式输出

2.1 json简介

JSON 是一种数据交换（通信）格式，可读性强，但 json.dumps() 默认会在分隔符后添加空格，产生冗余空白 --> 可通过 separators 参数（例如 separators=(',', ':')）去掉这些空白，对数据进行压缩

2.2 json.dumps()方法参数

json.dumps()方法：

sort_keys参数：为True时按键对dict进行排序后再输出；默认为False，此时按dict本身的存放（插入）顺序输出

一个合法的json文档：由大括号{}括起来的对象(键值对)；由中括号[]括起来的数组
dist_city={
    1:{
    "city_id":1,
    "city_name":"北京",
    "area":["城东区","城南区"]
    },
    2:{
    "city_id":2,
    "city_name":"上海",
    "area":["浦东区","朝阳区"]
    }
}
{
    "$schema": "http://json-schema.org/draft-04/schema#",
    "type": "object",
    "properties": {
        "email": {
            "type": "string"
        },
        "firstName":{
            "type": "string"
        },
        "lastName": {
            "type": "string"
        }
    
    }
}
含中文的dict默认会被转义成\uXXXX形式（看起来像乱码/编码错误），设置ensure_ascii=False即可原样输出中文
skipkeys参数：dumps序列化dict时，key必须是基本类型（str、int、float、bool或None），否则抛出TypeError；如果设置skipkeys=True --> 跳过（屏蔽）这些key不合法的键值对，而不是报错。
拒绝json.dumps()方法自动排序，sort_keys=False
json.dumps()方法输出自动换行缩进的数据格式，indent=4(值为缩进量)

3. python实现Restful框架的Flask接口

Flask官方教程文档：欢迎来到 Flask 的世界 — Flask中文文档(2.1.x)

python Restful API的Flask开发教程视频：3、Python RESTful API 开发_哔哩哔哩_bilibili

# 防止Flask实现的restful接口返回中文乱码（适用于Flask 2.2及更早版本；Flask 2.3起该配置项已移除，应改用 app.json.ensure_ascii = False）
app.config['JSON_AS_ASCII'] = False

4. bs4读取table表格数据

可以借助pandas.read_html()方法（注意不是用于读取分隔符文本文件的pandas.read_table()）把HTML中的table直接解析 => pandas DataFrame数据格式。

怎么说呢，pandas很强大，没有你想不到的数据处理方式

天狼啸月1990

关注

0
点赞
踩
11

收藏

觉得还不错? 一键收藏
0
评论
python爬虫实战(一)～爬取百度百科人物的文本+图片信息+Restful api接口

JSON 是一种数据交换（通信）格式，可读性强，但 json.dumps() 默认输出会带有冗余空白 --> 可通过 separators 参数对数据进行压缩。
复制链接

扫一扫

专栏目录