Where Does the Data Come From——01

1——Python 3.7 + PyCharm

  1. lxml
  2. requests
  3. re (regular expressions)
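A quick way to confirm the environment is ready (a minimal sketch; re ships with the standard library, while the other two come from pip install requests lxml):

import requests
from lxml import etree
# If these imports succeed and the versions print, the environment is set up
print(requests.__version__)
print(etree.LXML_VERSION)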

2——Target 1 (download the 京客隆 store data)

# author: mmm
# datetime: 2022/6/22 14:15 Wednesday
# software: PyCharm
"""
File description:
"""
import pandas as pd
import requests
from lxml import etree
# 1.1 Configuration
url = 'http://www.jkl.com.cn/shop.aspx'
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
response = requests.get(url, headers=ua).text
city_urls = etree.HTML(response).xpath('//div[@class="infoLis"]//@href')  # one href per district
for each in city_urls:
    # 1.2 Build each district's page URL
    city_url = 'http://www.jkl.com.cn/' + each
    # print(city_url)
    city_response = requests.get(url=city_url, headers=ua).text
    tree = etree.HTML(city_response)  # parse once, query four times
    店铺名称 = tree.xpath('//span[@class="con01"]/text()')
    经营地址 = tree.xpath('//span[@class="con02"]/text()')
    电话 = tree.xpath('//span[@class="con03"]/text()')
    营业时间 = tree.xpath('//span[@class="con04"]/text()')
    name_new = [names.strip() for names in 店铺名称]  # strip the whitespace padding around store names
    # print(name_new)
    # 1.3 Write the data to a CSV file
    # data = pd.DataFrame({'name': name_new, 'address': 经营地址, 'phone': 电话, 'time': 营业时间})
    # data.to_csv('京客隆1.csv', index=False, header=0, mode='a', encoding="gbk")

Note: the saved CSV comes out garbled; I tried every encoding [utf_8, utf_8_sig, gbk, ANSI] and none of them fixed it. The file is saved directly to the same directory as the .py script.
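One workaround worth trying for the garbling (a sketch, not verified against this site's data): write an .xlsx with DataFrame.to_excel instead, which sidesteps CSV text encoding entirely. Unlike to_csv, to_excel has no append mode, so rows would need to be collected across the loop first; all_frames and the output file name here are hypothetical:

import pandas as pd
all_frames = []  # hypothetical: append one DataFrame per district inside the loop
all_frames.append(pd.DataFrame({'name': ['示例店'], 'address': ['示例地址'], 'phone': ['010-0000'], 'time': ['8:00-21:00']}))
# After the loop, write everything in one shot (requires the openpyxl package)
pd.concat(all_frames).to_excel('京客隆1.xlsx', index=False)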

(screenshot: the scraped data)

3——Target 2 (download the posters)

# 2.1 The poster list has 5 pages, so just loop over the page numbers
i = 1
j = 1
while i <= 5:
    url2 = 'http://www.jkl.com.cn/phoLis.aspx?current=' + str(i)
    response2 = requests.get(url=url2, headers=ua).text
    res_data = etree.HTML(response2).xpath('//div[@class="proLis"]//img/@src')
    for img in res_data:
        url_2 = 'http://www.jkl.com.cn' + img
        img_data = requests.get(url_2, headers=ua).content  # the image as raw bytes
        img_name = 'c://mmm/海报' + str(j) + '.jpg'          # the mmm directory on the C drive must already exist, or this raises an error
        j += 1
        with open(img_name, 'wb') as poster:
            poster.write(img_data)
            print(img_name + ' downloaded..........')
    i += 1
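To drop the "directory must already exist" requirement noted in the comment above, the folder can be created up front with the standard library (a minimal sketch):

import os
# No-op if the directory is already there
os.makedirs('c://mmm', exist_ok=True)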

(screenshot: the downloaded posters)

4——Target 3 (download the reports)

  • The instructor's approach uses the zip() function to pair each file link with its file name (sketched below)
  • I used enumerate() instead, since the tags under ul/li correspond one-to-one
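For comparison, the zip() pairing from the first bullet would look roughly like this (a sketch; datas and names are the same two lists the script below extracts):

# zip() walks both lists in lockstep, pairing each link with its name
for href, name in zip(datas, names):
    print(name.strip(), '->', href)
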
# author:
# datetime: 2022/6/22 19:30 Wednesday
# software: PyCharm
"""
File description:
"""
import requests
from lxml import etree
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
for i in range(1, 5):  # there are 4 pages of content, numbered 1 through 4
    url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10009&current=' + str(i)
    response = requests.get(url, headers=ua).text
    tree = etree.HTML(response)
    datas = tree.xpath('//div[@class="newsLis"]/ul/li/a/@href')
    names = tree.xpath('//div[@class="newsLis"]/ul/li/a/text()')
    for index, n in enumerate(names):
        n = n.strip()
        suffix = datas[index].split('.')[-1]  # grab the file extension from the link
        url = 'http://www.jkl.com.cn' + datas[index]
        file_name = 'c://mmm/财务资料/' + n + '.' + suffix
        print(file_name)
        material = requests.get(url=url, headers=ua).content
        with open(file_name, 'wb') as ff:
            ff.write(material)
            print(file_name + ' downloaded..........')

(screenshot: the downloaded files)

5——Target 4 (auto-detect the page count)

# author:
# datetime: 2022/6/22 20:30 Wednesday
# software: PyCharm
"""
File description:
"""
import requests
from lxml import etree
import re
url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10011'
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
response = requests.get(url, headers=ua).text
last_page = etree.HTML(response).xpath('//a[text()="尾页"]/@href')  # href of the "last page" link, if present
if last_page:
    pos = re.search(r'current=(\d+)&', last_page[0])
    pages = int(pos.group(1))  # group(0) is equivalent to group()
    # print(pos.group(), pos.group(0), pos.group(1))
else:
    pages = 1  # there is only one page
print(pages)
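A quick illustration of the group() behavior mentioned in the comment (a sketch with a made-up href in the same shape as the site's 尾页 link):

import re
pos = re.search(r'current=(\d+)&', 'newsList.aspx?current=7&TypeId=10011')
print(pos.group())   # 'current=7&'  (the whole match)
print(pos.group(0))  # 'current=7&'  (identical to group())
print(pos.group(1))  # '7'           (just the captured page number)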

6——The complete script

# author:
# datetime: 2022/6/23 22:30 Thursday
# software: PyCharm
"""
File description:
"""
import requests
from lxml import etree
import re
import os
url = 'http://www.jkl.com.cn/newsList.aspx?TypeId=10009'
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
response = requests.get(url, headers=ua).text
# 1 Grab the first-level section titles and their links
tree = etree.HTML(response)
link1 = tree.xpath('//div[@class="infoLis"]//a/@href')
title1 = tree.xpath('//div[@class="infoLis"]//a/text()')
# print(link1[0], title1[0])
# 2 For each first-level link, get its data [file names and links, page count]
for v1, d1 in enumerate(link1):  # enumerate pairs each link with its index directly
    path1 = 'c://mmm/' + title1[v1]  # the outermost folder, one per section
    # print(path1)
    if not os.path.exists(path1):
        os.makedirs(path1)
    url1 = 'http://www.jkl.com.cn/' + d1
    # 2.1 Detect the page count pages1
    # print(url1)
    res1 = requests.get(url1, headers=ua).text
    last1 = etree.HTML(res1).xpath('//a[text()="尾页"]/@href')  # href of the "last page" link, if present
    # print(last1)
    if last1:
        pos1 = re.search(r'current=(\d+)&', last1[0])
        pages1 = int(pos1.group(1))  # group(0) is equivalent to group()
        # print(pos1.group(), pos1.group(0), pos1.group(1))
    else:
        pages1 = 1  # there is only one page
    # print(pages1)
    # 2.2 Fetch every page's data
    if pages1 >= 2:
        for p1 in range(1, pages1 + 1):
            # print(p1)
            url2 = 'http://www.jkl.com.cn/' + d1 + '&current=' + str(p1)  # paginate this section's own link rather than a fixed TypeId
            # print(url2)
            temp = requests.get(url2, headers=ua).text  # ua goes in as headers=, not as the positional params argument
            page_tree = etree.HTML(temp)  # parse the page just fetched
            data2 = page_tree.xpath('//div[@class="newsLis"]/ul/li/a/@href')
            name2 = page_tree.xpath('//div[@class="newsLis"]/ul/li/a/text()')
            for index, n in enumerate(name2):
                n = n.strip()
                n = n.replace('...', '报表')  # the site truncates long names with '...'
                n = n.replace('/', '')       # a '/' in the name would break the file path
                suffix = data2[index].split('.')[-1]  # grab the file extension from the link
                url3 = 'http://www.jkl.com.cn' + data2[index]
                file_name = path1 + '/' + n + '.' + suffix
                print(file_name)
                # material = requests.get(url=url3, headers=ua).content
                # with open(file_name, 'wb') as ff:
                #     ff.write(material)
                #     print(file_name + ' downloaded..........')
    else:
        response1 = requests.get('http://www.jkl.com.cn/' + d1, headers=ua).text
        page_tree = etree.HTML(response1)
        data2 = page_tree.xpath('//div[@class="newsLis"]/ul/li/a/@href')
        name2 = page_tree.xpath('//div[@class="newsLis"]/ul/li/a/text()')
        for index, n in enumerate(name2):
            n = n.strip()
            n = n.replace('...', '报表')
            n = n.replace('/', '')
            if '.' not in data2[index]:  # no extension, so not a real file link
                continue
            suffix = data2[index].split('.')[-1]  # grab the file extension from the link
            url4 = 'http://www.jkl.com.cn' + data2[index]
            file_name = path1 + '/' + n + '.' + suffix
            print(file_name)
            # material = requests.get(url=url4, headers=ua).content
            # with open(file_name, 'wb') as ff:
            #     ff.write(material)
            #     print(file_name + ' downloaded..........')
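Since the full script issues dozens of requests against the same host, reusing one connection via requests.Session can speed things up; a minimal sketch of the substitution (the UA string is shortened here):

import requests
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # set once instead of passing headers=ua on every call
response = session.get('http://www.jkl.com.cn/newsList.aspx?TypeId=10009').text
# Later session.get(...) calls reuse the same underlying TCP connection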

7——Recap and takeaways

I followed along with [孙兴华]'s course and got a lot out of it. Keep at it, fellow workers: raise your efficiency and clock out earlier.
