Scraping web pages 3

def main():
    """Main entry point: fetch, parse, and save the weather data."""
    print("Weather test")
    # Wuhan (weather.com.cn city code 101200101)
    url1 = 'http://www.weather.com.cn/weather/101200101.shtml'  # 7-day forecast, weather.com.cn
    url2 = 'http://www.weather.com.cn/weather15d/101200101.shtml'  # day 8-15 forecast, weather.com.cn

    html1 = getHTMLtext(url1)
    data1, data1_7 = get_content(html1)  # today's data plus days 1-7

    html2 = getHTMLtext(url2)
    data8_14 = get_content2(html2)  # days 8-14
    data14 = data1_7 + data8_14
    # print(data14)
    write_to_csv('weather14.csv', data14, 14)  # save as a CSV file
    write_to_csv('weather1.csv', data1, 1)


if __name__ == '__main__':
    main()
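
The helpers getHTMLtext, get_content, get_content2, and write_to_csv come from the earlier installments and are not shown in this excerpt. As a rough sketch, the two generic ones might look like the following, assuming requests and the standard csv module; the CSV column layout and the unused day parameter are assumptions, and the page-specific parsers get_content/get_content2 are omitted:

import csv
import requests


def getHTMLtext(url):
    """Fetch a page and return its decoded HTML, or '' on failure (sketch)."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # weather.com.cn pages are not always UTF-8
        return r.text
    except requests.exceptions.RequestException:
        return ''


def write_to_csv(file_name, data, day=14):
    """Write parsed rows to a CSV file (sketch; 'day' mirrors the original call signature but is unused here)."""
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'weather', 'low', 'high'])  # assumed column layout
        writer.writerows(data)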

Saving images 1

from lxml import etree
import requests
import os

if __name__ == '__main__':
    url = 'https://www.vcg.com/creative-image/xigua/'
    # url = 'https://pic.netbian.com/4kdongman/'  # alternate target left over from testing
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36 Edg/102.0.1245.39'
    }

    response = requests.get(url=url, headers=header)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    figure_list = tree.xpath('//div[@class="gallery_inner"]/figure')
    if not os.path.exists('./piclitl'):
        os.makedirs('./piclitl')
    for figure in figure_list:
        try:
            # the gallery lazy-loads images, so the real URL lives in data-src
            img_src = figure.xpath('./a/img/@data-src')[0]
        except IndexError:
            print('Failed to match the image field')
            continue  # skip this figure instead of reusing a stale img_src
        img_src = 'https:' + img_src
        img_name = img_src.split('/')[-1]
        try:
            img_data = requests.get(url=img_src, headers=header).content
        except requests.exceptions.RequestException:
            print('Could not fetch the image URL')
            continue  # without this, img_data from a previous iteration would be written
        img_path = 'piclitl/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(img_name, 'downloaded successfully')
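
One fragility in the loop above: galleries like this lazy-load images, so some img nodes expose the URL in data-src while others use a plain src. A small hedged helper (the name extract_img_url is ours, not from the original) that tries both attributes before giving up:

def extract_img_url(figure):
    """Return the image URL from a figure node, preferring the lazy-load attribute."""
    for attr in ('data-src', 'src'):
        match = figure.xpath('./a/img/@' + attr)
        if match:
            return match[0]
    return None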


Saving images 2

from lxml import etree
import requests
import os

if __name__ == '__main__':
    host = 'https://pic.netbian.com/'
    url = 'https://pic.netbian.com/4kdongman/'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36 Edg/102.0.1245.39'
    }

    response = requests.get(url=url, headers=header)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    figure_list = tree.xpath('//div[@class="slist"]/ul[@class="clearfix"]/li')
    if not os.path.exists('./piclitl2'):
        os.makedirs('./piclitl2')
    for figure in figure_list:
        try:
            img_src = figure.xpath('./a/img/@src')[0]
            print(img_src)
        except IndexError:
            print('Failed to match the image field')
            continue  # skip this item instead of reusing a stale img_src
        img_src = host.rstrip('/') + img_src  # the src is site-relative and already starts with '/'
        img_name = img_src.split('/')[-1]
        try:
            img_data = requests.get(url=img_src, headers=header).content
        except requests.exceptions.RequestException:
            print('Could not fetch the image URL')
            continue  # without this, img_data from a previous iteration would be written
        img_path = 'piclitl2/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(img_name, 'downloaded successfully')
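
Both download scripts only read the first listing page. As a sketch of how the same loop could walk several pages, assuming pic.netbian.com's index_N.html pagination pattern (the first page has no suffix; verify this on the live site before relying on it):

def list_page_urls(base='https://pic.netbian.com/4kdongman/', pages=3):
    """Yield listing-page URLs: the bare base for page 1, then index_N.html (assumed pattern)."""
    yield base
    for n in range(2, pages + 1):
        yield base + 'index_' + str(n) + '.html'

# Each yielded URL can then be fetched and parsed exactly like `url` in the script above.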
 
