【爬虫实战案例】python数据爬取

导入所需要的库

import requests
from bs4 import BeautifulSoup
import pprint
import json
from lxml import etree

1.爬取所有页面的HTML

def download_htmls():
    """
    Download the HTML of the first 10 list pages of the target site.

    Returns:
        list[str]: UTF-8 decoded HTML text of each page, in page order.

    Raises:
        Exception: if any page responds with a non-200 status code.
    """
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # NOTE(review): 'br' requires brotli support in requests; confirm it
    # is installed in the runtime environment.
    'Accept-Encoding': 'br',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    }

    htmls = []  # collected page HTML, one entry per page
    for idx in range(10):
        # Only the first 10 pages are crawled for now.
        url = f'http://www.tanpaifang.com/tanzhonghe/list_14_{idx+1}.html'
        print('爬取第{}页,网页url:{}'.format(idx+1,url))
        res = requests.get(url, headers=headers, allow_redirects=True, verify=True)
        # Fail fast BEFORE decoding the body (the original decoded first,
        # so a failed request could raise a confusing decode error).
        if res.status_code != 200:
            raise Exception('error!!')
        # The server omits the charset, so requests would fall back to
        # ISO-8859-1; decode the raw bytes as UTF-8 directly instead of
        # the lossy encode/decode round-trip.
        htmls.append(res.content.decode('utf-8'))
    return htmls
# Run the crawl now and keep the raw page HTML for the parsing step below.
htmls = download_htmls()

2.解析HTML得到目标数据

def parse_single_html(html):
    """
    Parse one list page and extract the target fields for every article.

    Args:
        html (str): raw HTML text of a single list page.

    Returns:
        list[dict]: one dict per article link, with keys
        'titles', 'links', 'sources', 'times'.
    """
    tree = etree.HTML(html)
    anchors = tree.xpath(".//div[@class='left_list_box']/h1/a[@class='title']")
    if not anchors:
        # No article links on this page — nothing to extract.
        return []

    # These two queries are page-level and loop-invariant, so evaluate
    # them once instead of once per article (the original re-ran both
    # xpath calls on every iteration).
    # NOTE(review): source/time come from the first 'banquan' block of
    # the whole page, so every article on the page gets the same values —
    # this mirrors the original behaviour; confirm it is intended.
    banquan = tree.xpath(".//div[@class='banquan']/span/text()")
    source = banquan[0]   # source
    pub_time = banquan[1] # publication time

    data_crew = []
    for anchor in anchors:
        data_crew.append({
            'titles': anchor.get('title'),
            'links': anchor.get('href'),
            'sources': source,
            'times': pub_time,
        })
    return data_crew

    
# Parse every downloaded page and flatten the results into one record list.
all_datas = []
for page_html in htmls:
    print('==' * 30)  # visual separator between pages
    all_datas += parse_single_html(page_html)

============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
# Persist the scraped records to an Excel workbook for later analysis.
import pandas as pd

result_frame = pd.DataFrame(all_datas)
print(result_frame)
result_frame.to_excel(r'F:\桌面\xxx11.xlsx')
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Wency(王斯-CUEB)

我不是要饭的

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值