Proxies and CSS-selector parsing libraries

  • Fetching proxy IPs
import requests


# Request a batch of proxy IPs from the provider's API
def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    res = requests.get(api)
    if res.status_code == 200:
        # On failure the API returns a JSON error object, so a leading '{' means no proxies
        if res.text[0] == '{':
            print('Failed to get proxies')
        else:
            # Success: one 'ip:port' per line; drop the trailing empty entry
            return res.text.split('\n')[:-1]
    else:
        print('Proxy request failed')


def get_net_data():
    url = 'https://movie.douban.com/top250'
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # Proxies: requests treats a bare 'ip:port' value as an http proxy
    ips = get_proxy_ips()
    if ips:
        proxies = {
            'http': ips[0],
            'https': ips[0]
        }
        res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
        if res.status_code == 200:
            print(res.text)
        else:
            print('Page request failed')
    else:
        print('Failed to get any proxies')


if __name__ == '__main__':
    get_net_data()
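Before spending a request on the real target site, it can help to verify that a proxy actually responds. A minimal sketch (httpbin.org is just a public echo service used here for illustration; it is not part of the original code):

import requests


def check_proxy(ip_port):
    """Return True if the proxy can fetch a test page within 2 seconds."""
    # httpbin.org/ip simply echoes the caller's IP, so it is a cheap test target
    proxies = {'http': ip_port, 'https': ip_port}
    try:
        res = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=2)
        return res.status_code == 200
    except requests.exceptions.RequestException:
        return False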

Optimized version

The version above tries a single proxy and gives up if anything fails. The version below keeps requesting fresh batches of proxies and rotates through them until the page is fetched successfully.

import requests
import time


# Request a batch of proxy IPs from the provider's API
def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    res = requests.get(api)
    if res.status_code == 200:
        # A JSON error object (leading '{') usually means we polled too frequently
        if res.text[0] == '{':
            print('Failed to get proxies: extracting too frequently.')
        else:
            return res.text.split('\n')[:-1]
    else:
        print('Proxy request failed')


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # Keep trying until the page is fetched successfully
    while True:
        # Get a fresh batch of proxy IPs
        ips = get_proxy_ips()
        # If the batch is empty, wait a moment and ask again
        if not ips:
            print('Failed to get proxies')
            time.sleep(1)
            continue

        # Rotate through the batch, using a different proxy for http and https
        while ips:
            ip1 = ips.pop()
            # Guard against an odd-sized batch by reusing ip1 for https
            ip2 = ips.pop() if ips else ip1
            proxies = {
                'http': ip1,
                'https': ip2
            }
            try:
                res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
                if res.status_code == 200:
                    return res.text
                else:
                    print('Page request failed')
            except (requests.exceptions.ProxyError,
                    requests.exceptions.ConnectTimeout,
                    requests.exceptions.ReadTimeout):
                print('Proxy timed out, trying the next one')


if __name__ == '__main__':
    result = get_net_data('https://movie.douban.com/top250')
    print(result)
  • Using bs4
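BeautifulSoup (installed as the bs4 package, usually together with the lxml parser) turns an HTML string into a searchable tree and supports CSS selectors. A minimal sketch of the calls the exercise below relies on, run against an inline snippet so it needs no network access (the HTML here is made up for illustration):

from bs4 import BeautifulSoup

html = '''
<ul class="houses">
    <li><a class="name" href="/a">House A</a><span class="price">100</span></li>
    <li><a class="name" href="/b">House B</a><span class="price">200</span></li>
</ul>
'''

bs = BeautifulSoup(html, 'lxml')   # the stdlib 'html.parser' also works if lxml is missing

# select() returns every tag matching a CSS selector
for li in bs.select('.houses > li'):
    # select_one() returns only the first match (or None)
    name_tag = li.select_one('.name')
    print(name_tag.get_text())         # tag text, e.g. 'House A'
    print(name_tag.attrs['href'])      # tag attribute, e.g. '/a'
    print(li.select_one('.price').get_text())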
  • Exercise: scrape basic house-listing info from Beike
import requests
from bs4 import BeautifulSoup
import csv


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        return res.text
    # Falls through (returning None) on a non-200 response
    print(res)


def analysis_data(html: str):
    bs = BeautifulSoup(html, 'lxml')
    # Each listing is an <li> under the results wrapper
    house_lis = bs.select('.resblock-list-wrapper>li')
    all_house = []
    for li in house_lis:
        house = {}
        # Lazy-loaded images keep the real URL in the data-original attribute
        img_src = li.select_one('.lj-lazy').attrs['data-original']
        house['image'] = img_src

        name = li.select_one('.name').get_text()
        house['name'] = name

        price = li.select_one('.main-price').get_text().replace('\n', '')
        house['price'] = price

        location = li.select_one('.resblock-location').get_text().strip()
        house['location'] = location

        all_house.append(house)
    # The files/ directory must already exist
    with open('files/beike.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['image', 'name', 'price', 'location'])
        # Optional Chinese header row instead of the raw field names:
        # writer.writerow({'image': '图片', 'name': '名称', 'price': '房价', 'location': '地址'})
        writer.writeheader()
        writer.writerows(all_house)


if __name__ == '__main__':
    analysis_data(get_net_data('https://cd.fang.ke.com/loupan/pg1/'))
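To confirm the CSV came out as expected, csv.DictReader can read it straight back. A quick sanity check, assuming the scrape above has already run:

import csv

# Print the first three listings from the file written by analysis_data()
with open('files/beike.csv', encoding='utf-8') as f:
    for i, row in enumerate(csv.DictReader(f)):
        if i >= 3:
            break
        print(row['name'], row['price'], row['location'])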