python爬虫笔试
列表页:
1> 文章标题
2> 摘要
3> 时间
4> 列表中文章的图片
详情页:
1> 文章题目
2> 原文标题
3> 原文来源
4> 文章正文
5> 原文链接
1.尝试爬取地址为: https://www.theblockbeats.info/topic/87 下面的10篇文
章并使用 Beautiful Soup 解析到以下内容
2
[{'title_1': {'summary': '', 'time': '', 'icon': ''}}, ...]
得到结果格式:
列表页:
详情页:
3
[{'title': '', 'original_title': '', 'original_source': '', 'content': '', 'original_link': '' }, ...]
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup # 网页解析
import urllib.request, urllib.error # URL操作,获取网页数据
# 爬虫函数
def crawl(url, headers):
    """Fetch the topic list page and parse every article card on it.

    Args:
        url: List-page URL to fetch.
        headers: HTTP request headers (e.g. a User-Agent) passed to urllib.

    Returns:
        A list of dicts, one per article, with keys 'title', 'summary',
        'time' and 'icon' (image URL).

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    # Parse the page with BeautifulSoup's built-in html.parser.
    soup = BeautifulSoup(html, "html.parser")

    articles = []
    containers = soup.find_all('div', class_="news-flash-components")
    if not containers:
        # Layout changed or empty response: return an empty list instead of
        # raising IndexError like the bare [0] index did.
        return articles
    # The first container's direct children are the individual article cards.
    for item in containers[0]:
        # Skip bare text nodes (whitespace between tags has no .find/.a).
        if not hasattr(item, 'find'):
            continue
        articles.append({
            # Article title is carried in the title attribute of the card link.
            'title': item.a["title"],
            # Summary text of the card.
            'summary': item.find('div', class_='home-news-lft-content text-ellipsis2').text,
            # Publication time label.
            'time': item.find('div', class_='home-news-time').text,
            # Thumbnail image URL.
            'icon': item.div.img["src"],
        })
    return articles
if __name__ == '__main__':
    # Entry point: scrape the topic list page and print the parsed entries.
    target_url = "https://www.theblockbeats.info/topic/87"
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/67.0.3396.99 Safari/537.36'
        ),
    }
    print(crawl(target_url, request_headers))
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request, urllib.error
# 爬虫函数
def crawl(url, headers):
    """Fetch the topic list page and collect each article's detail-page href.

    Args:
        url: List-page URL to fetch.
        headers: HTTP request headers passed to urllib.

    Returns:
        A list of relative href strings, one per article card found.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    containers = soup.find_all('div', class_="news-flash-components")
    if not containers:
        # No list container found: return an empty list instead of raising
        # IndexError like the bare [0] index did.
        return []
    # The first container's direct children are the article cards; take the
    # link target from each card's first <a>.  The original appended an empty
    # dict and immediately overwrote it with the href string; the guarded
    # comprehension builds the string list directly and skips bare text nodes.
    return [item.a["href"] for item in containers[0] if hasattr(item, 'find')]
def crawlx(url, headers):
    """Fetch one article detail page and parse its fields.

    Args:
        url: Detail-page URL to fetch.
        headers: HTTP request headers passed to urllib.

    Returns:
        A dict with keys 'title', 'original_title', 'original_source',
        'content' (stringified list of <p> tags, as before) and
        'original_link'.  The two 'original_*' keys are present only when
        the page has the corresponding <blockquote> elements.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection instead of leaking the socket.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    result = {}
    title_div = soup.find('div', class_="news-title")
    # Guard against layout changes: a missing title yields '' instead of the
    # AttributeError the original raised on .text of None.
    result['title'] = title_div.text if title_div else ''

    # By page convention the first <blockquote> holds the original title and
    # the second the original source; index directly instead of counting
    # through a loop with a manual flag variable.
    quotes = soup.find_all('blockquote')
    if len(quotes) >= 1:
        result['original_title'] = quotes[0].text
    if len(quotes) >= 2:
        result['original_source'] = quotes[1].text
    # Article body: stringified list of all <p> tags (kept as-is).
    result['content'] = str(soup.find_all('p'))
    # BUG FIX: the original stored the bs4 Tag object itself; store its text
    # so the result holds a plain string like every other field.
    result['original_link'] = quotes[-1].text if quotes else ''
    return result
if __name__ == '__main__':
    # Entry point: collect article links from the list page, then scrape up
    # to 10 detail pages and print the parsed results.
    url = "https://www.theblockbeats.info/topic/87"
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/67.0.3396.99 Safari/537.36'
        ),
    }
    # Make the relative hrefs returned by crawl() absolute.
    links = ["https://www.theblockbeats.info" + href for href in crawl(url, headers)]
    # Slice to at most 10 pages: the original indexed range(10) unconditionally,
    # which raised IndexError whenever fewer than 10 links were found.
    jieguo = [crawlx(link, headers) for link in links[:10]]
    print(jieguo)