Proxies and CSS-selector parsing libraries

  • Fetching proxy IPs
import requests


# Request a batch of proxy IPs from the provider's API
def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    res = requests.get(api)
    if res.status_code == 200:
        # On failure the API returns a JSON error object, so a leading '{' means no proxies
        if res.text[0] == '{':
            print('Failed to get proxies')
        else:
            # Success: one 'ip:port' per line; drop the trailing empty entry
            return res.text.split('\n')[:-1]
    else:
        print('Proxy request failed')


def get_net_data():
    url = 'https://movie.douban.com/top250'
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # Proxies: requests treats a bare 'ip:port' value as an http proxy
    ips = get_proxy_ips()
    if ips:
        proxies = {
            'http': ips[0],
            'https': ips[0]
        }
        res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
        if res.status_code == 200:
            print(res.text)
        else:
            print('Page request failed')
    else:
        print('Failed to get any proxies')


if __name__ == '__main__':
    get_net_data()
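Before spending a request on the real target site, it can help to verify that a proxy actually responds. A minimal sketch (httpbin.org is just a public echo service used here for illustration; it is not part of the original code):

import requests


def check_proxy(ip_port):
    """Return True if the proxy can fetch a test page within 2 seconds."""
    # httpbin.org/ip simply echoes the caller's IP, so it is a cheap test target
    proxies = {'http': ip_port, 'https': ip_port}
    try:
        res = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=2)
        return res.status_code == 200
    except requests.exceptions.RequestException:
        return False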

Optimized version

The version above tries a single proxy and gives up if anything fails. The version below keeps requesting fresh batches of proxies and rotates through them until the page is fetched successfully.

import requests
import time


# Request a batch of proxy IPs from the provider's API
def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    res = requests.get(api)
    if res.status_code == 200:
        # A JSON error object (leading '{') usually means we polled too frequently
        if res.text[0] == '{':
            print('Failed to get proxies: extracting too frequently.')
        else:
            return res.text.split('\n')[:-1]
    else:
        print('Proxy request failed')


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # Keep trying until the page is fetched successfully
    while True:
        # Get a fresh batch of proxy IPs
        ips = get_proxy_ips()
        # If the batch is empty, wait a moment and ask again
        if not ips:
            print('Failed to get proxies')
            time.sleep(1)
            continue

        # Rotate through the batch, using a different proxy for http and https
        while ips:
            ip1 = ips.pop()
            # Guard against an odd-sized batch by reusing ip1 for https
            ip2 = ips.pop() if ips else ip1
            proxies = {
                'http': ip1,
                'https': ip2
            }
            try:
                res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
                if res.status_code == 200:
                    return res.text
                else:
                    print('Page request failed')
            except (requests.exceptions.ProxyError,
                    requests.exceptions.ConnectTimeout,
                    requests.exceptions.ReadTimeout):
                print('Proxy timed out, trying the next one')


if __name__ == '__main__':
    result = get_net_data('https://movie.douban.com/top250')
    print(result)
  • Using bs4
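BeautifulSoup (installed as the bs4 package, usually together with the lxml parser) turns an HTML string into a searchable tree and supports CSS selectors. A minimal sketch of the calls the exercise below relies on, run against an inline snippet so it needs no network access (the HTML here is made up for illustration):

from bs4 import BeautifulSoup

html = '''
<ul class="houses">
    <li><a class="name" href="/a">House A</a><span class="price">100</span></li>
    <li><a class="name" href="/b">House B</a><span class="price">200</span></li>
</ul>
'''

bs = BeautifulSoup(html, 'lxml')   # the stdlib 'html.parser' also works if lxml is missing

# select() returns every tag matching a CSS selector
for li in bs.select('.houses > li'):
    # select_one() returns only the first match (or None)
    name_tag = li.select_one('.name')
    print(name_tag.get_text())         # tag text, e.g. 'House A'
    print(name_tag.attrs['href'])      # tag attribute, e.g. '/a'
    print(li.select_one('.price').get_text())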
  • Exercise: scrape basic house-listing info from Beike
import requests
from bs4 import BeautifulSoup
import csv


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        return res.text
    # Falls through (returning None) on a non-200 response
    print(res)


def analysis_data(html: str):
    bs = BeautifulSoup(html, 'lxml')
    # Each listing is an <li> under the results wrapper
    house_lis = bs.select('.resblock-list-wrapper>li')
    all_house = []
    for li in house_lis:
        house = {}
        # Lazy-loaded images keep the real URL in the data-original attribute
        img_src = li.select_one('.lj-lazy').attrs['data-original']
        house['image'] = img_src

        name = li.select_one('.name').get_text()
        house['name'] = name

        price = li.select_one('.main-price').get_text().replace('\n', '')
        house['price'] = price

        location = li.select_one('.resblock-location').get_text().strip()
        house['location'] = location

        all_house.append(house)
    # The files/ directory must already exist
    with open('files/beike.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['image', 'name', 'price', 'location'])
        # Optional Chinese header row instead of the raw field names:
        # writer.writerow({'image': '图片', 'name': '名称', 'price': '房价', 'location': '地址'})
        writer.writeheader()
        writer.writerows(all_house)


if __name__ == '__main__':
    analysis_data(get_net_data('https://cd.fang.ke.com/loupan/pg1/'))
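To confirm the CSV came out as expected, csv.DictReader can read it straight back. A quick sanity check, assuming the scrape above has already run:

import csv

# Print the first three listings from the file written by analysis_data()
with open('files/beike.csv', encoding='utf-8') as f:
    for i, row in enumerate(csv.DictReader(f)):
        if i >= 3:
            break
        print(row['name'], row['price'], row['location'])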