【爬虫实战案例】python数据爬取

导入所需要的库

import requests
from bs4 import BeautifulSoup
import pprint
import json
from lxml import etree

1.爬取所有页面的HTML

def download_htmls():
    """
    Download the HTML of the first 10 list pages of the target site.

    Returns:
        list[str]: UTF-8 decoded HTML text of each page, in page order.

    Raises:
        Exception: if any page responds with a non-200 status code.
    """
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # NOTE(review): 'br' requires brotli support in requests; confirm it
    # is installed in the runtime environment.
    'Accept-Encoding': 'br',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    }

    htmls = []  # collected page HTML, one entry per page
    for idx in range(10):
        # Only the first 10 pages are crawled for now.
        url = f'http://www.tanpaifang.com/tanzhonghe/list_14_{idx+1}.html'
        print('爬取第{}页,网页url:{}'.format(idx+1,url))
        res = requests.get(url, headers=headers, allow_redirects=True, verify=True)
        # Fail fast BEFORE decoding the body (the original decoded first,
        # so a failed request could raise a confusing decode error).
        if res.status_code != 200:
            raise Exception('error!!')
        # The server omits the charset, so requests would fall back to
        # ISO-8859-1; decode the raw bytes as UTF-8 directly instead of
        # the lossy encode/decode round-trip.
        htmls.append(res.content.decode('utf-8'))
    return htmls
# Run the crawl now and keep the raw page HTML for the parsing step below.
htmls = download_htmls()

2.解析HTML得到目标数据

def parse_single_html(html):
    """
    Parse one list page and extract the target fields for every article.

    Args:
        html (str): raw HTML text of a single list page.

    Returns:
        list[dict]: one dict per article link, with keys
        'titles', 'links', 'sources', 'times'.
    """
    tree = etree.HTML(html)
    anchors = tree.xpath(".//div[@class='left_list_box']/h1/a[@class='title']")
    if not anchors:
        # No article links on this page — nothing to extract.
        return []

    # These two queries are page-level and loop-invariant, so evaluate
    # them once instead of once per article (the original re-ran both
    # xpath calls on every iteration).
    # NOTE(review): source/time come from the first 'banquan' block of
    # the whole page, so every article on the page gets the same values —
    # this mirrors the original behaviour; confirm it is intended.
    banquan = tree.xpath(".//div[@class='banquan']/span/text()")
    source = banquan[0]   # source
    pub_time = banquan[1] # publication time

    data_crew = []
    for anchor in anchors:
        data_crew.append({
            'titles': anchor.get('title'),
            'links': anchor.get('href'),
            'sources': source,
            'times': pub_time,
        })
    return data_crew

    
# Parse every downloaded page and flatten the results into one record list.
all_datas = []
for page_html in htmls:
    print('==' * 30)  # visual separator between pages
    all_datas += parse_single_html(page_html)

============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
============================================================
# Persist the scraped records to an Excel workbook for later analysis.
import pandas as pd

result_frame = pd.DataFrame(all_datas)
print(result_frame)
result_frame.to_excel(r'F:\桌面\xxx11.xlsx')
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Wency(王斯-CUEB)

我不是要饭的

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值