day22 - Web Scraping 2

Summary

1. Proxies
  • """
    @Time:  2021/5/26 9:37
    @Author:    三玖天下第一
    """
    
    # api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=2'
    
    import requests
    import time
    
    # http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3
    
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    
    
    def get_proxy_ips(api=api):
        response = requests.get(api)
        if response.status_code == 200:
            if response.text[0] == '{':
                print('Fetching too frequently; please follow the required rate limit')
                time.sleep(2)
            else:
                return response.text.split('\n')[:-1]
        else:
            print('Failed to get proxy IPs!')
            time.sleep(3)
            return get_proxy_ips(api)
    
    
    def get_html(url, headers=None, proxies=None, proxies_gen=None):
        response = None
        try:
            response = requests.get(url, headers=headers, proxies=proxies)
            if response.status_code == 200:
                return response.text
            # Non-200 response: if the current proxies have not been replaced yet,
            # fetch a fresh set and retry with it.
            if proxies_gen and not proxies_gen.is_update(proxies):
                new_proxies = proxies_gen.update_proxies()
                return get_html(url, headers=headers, proxies=new_proxies, proxies_gen=proxies_gen)
        except (TimeoutError, requests.RequestException) as e:
            print('Request timed out...', e)
            if proxies_gen and not proxies_gen.is_update(proxies):
                new_proxies = proxies_gen.update_proxies()
                return get_html(url, headers=headers, proxies=new_proxies, proxies_gen=proxies_gen)
        finally:
            if response is not None:
                response.close()
    
    
    if __name__ == '__main__':
        url = 'https://movie.douban.com/top250'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        }
    
    
        def ip_generator():
            while True:
                ips = get_proxy_ips()
                if not ips:
                    continue
                for ip in ips:
                    yield ip
    
    
        ipgen = ip_generator()
    
    
        def gen_proxies(gen=ipgen):
            ip = next(gen)
            return {
                'http': ip,
                'https': ip,
            }
    
    
        class ProxiesCls:
    
            def __init__(self):
                self.proxies = gen_proxies()
    
            def update_proxies(self):
                self.proxies = gen_proxies()
                return self.proxies
    
            def is_update(self, old):
                if old == self.proxies:
                    return False
                return True
    
        proxies_obj = ProxiesCls()
        html_str = get_html(url, headers=headers, proxies=proxies_obj.proxies, proxies_gen=proxies_obj)
        print(html_str)
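
To confirm that a fetched proxy actually works before crawling with it, a quick request to https://httpbin.org/ip (which simply echoes the caller's IP) can help. A minimal standalone sketch; the 'ip:port' value below is a placeholder for one returned by get_proxy_ips():

    # Standalone proxy sanity check (sketch): httpbin.org/ip echoes the requesting IP,
    # so a working proxy should show its own address instead of yours.
    import requests

    def check_proxy(proxies, timeout=5):
        try:
            response = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=timeout)
            if response.status_code == 200:
                print('Exit IP:', response.json()['origin'])
                return True
        except requests.RequestException as e:
            print('Proxy check failed:', e)
        return False

    if __name__ == '__main__':
        ip = '123.123.123.123:8888'   # placeholder; use a value from get_proxy_ips()
        check_proxy({'http': 'http://' + ip, 'https': 'http://' + ip})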
    
    
2. Using bs4
  • """
    @Time:  2021/5/26 11:29
    @Author:    三玖天下第一
    """
    
    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'cookie': 'MUID=00CF3FB6A11967AD2CAC2FF5A0376609; MUIDB=00CF3FB6A11967AD2CAC2FF5A0376609; _EDGE_S=F=1&SID=0A4B569CE0856D7B2E9146DFE1AB6C6E; _EDGE_V=1; SRCHD=AF=QBRE; SRCHUID=V=2&GUID=BE1781471D3C485EB7054DFDCEF154F7&dmnchg=1; _SS=SID=0A4B569CE0856D7B2E9146DFE1AB6C6E; MUIDV=NU=1; SRCHUSR=DOB=20210526&T=1622000102000&TPC=1622000103000; ipv6=hit=1622003704776&t=4; SRCHHPGUSR=SRCHLANGV2=zh-Hans&BZA=0&BRW=XW&BRH=S&CW=1879&CH=469&DPR=1&UTC=480&DM=3&HV=1622000103&WTS=63757596902',
        'origin': 'https://cn.bing.com'
    }
    
    
    def get_net_data(url):
        response = None
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
        finally:
            if response is not None:
                response.close()
    
    
    def analysis_data(data: str, selector=''):
        # 1. Create the parser
        bs = BeautifulSoup(data, 'lxml')
    
        # 2. Select tags with a CSS selector
        selector = selector or 'div#b_content'
        result = bs.select(selector)
        print(len(result))      # 1
        print(type(result))     # <class 'bs4.element.ResultSet'>
        print(type(result[0]))  # <class 'bs4.element.Tag'>
        print(result)
    
        # 3. Get tag content
        # 1) tag.string     -   gets the tag's text (if the tag contains several child tags, or both text and child tags, the result is None)
        selector = 'div#b_content h1'
        div = bs.select_one(selector)
        print(type(div))        # <class 'bs4.element.Tag'>
        print('div.string:', div.string)    # None
        # 2) tag.get_text() -   gets the tag's text content (text inside child tags is included)
        print('div.text:', div.get_text())  # div.text: 没有与此相关的结果: c语言如何查看argv长度
        # 3) tag.contents   -   gets the tag's text and child tags as a list (text and child tags as separate items)
        print('div.contents', type(div.contents), div.contents)     # div.contents <class 'list'> ['没有与此相关的结果: ', <strong>c语言如何查看argv长度</strong>]
    
        # 4. Get tag attributes
        img = bs.select_one('img')
        # 1) tag.attrs[attribute_name]
        print(img)
        print(img.attrs['src'])
    
        # 5. Find specific child tags inside a given tag
        # 1) tag.select(selector)
        # tag.select(css_selector)      -   gets all tags matched by the selector inside the given tag
        # tag.select_one(css_selector)  -   gets the first tag matched by the selector inside the given tag
        div2 = div.select('span')
        print(div2)
    
    if __name__ == '__main__':
        # https://cn.bing.com/search?q=c%E8%AF%AD%E8%A8%80%E5%A6%82%E4%BD%95%E6%9F%A5%E7%9C%8Bargv%E9%95%BF%E5%BA%A6&qs=n&form=QBRE&sp=-1&pq=c%E8%AF%AD%E8%A8%80%E5%A6%82%E4%BD%95%E6%9F%A5%E7%9C%8Bargv%E9%95%BF%E5%BA%A6&sc=0-13&sk=&cvid=760A7C0047094D159C8B1F6DE7426C96
        url = 'https://cn.bing.com/search?q=c%E8%AF%AD%E8%A8%80%E5%A6%82%E4%BD%95%E6%9F%A5%E7%9C%8Bargv%E9%95%BF%E5%BA%A6&qs=n&form=QBRE&sp=-1&pq=c%E8%AF%AD%E8%A8%80%E5%A6%82%E4%BD%95%E6%9F%A5%E7%9C%8Bargv%E9%95%BF%E5%BA%A6&sc=0-13&sk=&cvid=760A7C0047094D159C8B1F6DE7426C96'
        html_str = get_net_data(url)
        analysis_data(html_str)
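
The Bing page above depends on live cookies and may change, so the same tag APIs are easier to see on a small hand-written HTML string. A self-contained sketch, no network request needed:

    # Illustration of .string / .get_text() / .contents / .attrs on an inline snippet.
    from bs4 import BeautifulSoup

    html = '<div id="box"><p>Hello <b>world</b></p><img src="a.png" alt="demo"></div>'
    soup = BeautifulSoup(html, 'lxml')

    p = soup.select_one('div#box p')
    print(p.string)       # None - the tag holds both text and a child tag
    print(p.get_text())   # Hello world - child-tag text is included
    print(p.contents)     # ['Hello ', <b>world</b>] - text and children as separate list items

    img = soup.select_one('img')
    print(img.attrs['src'])                          # a.png
    print(soup.select_one('div#box').select('b'))    # select() also works on a tag: [<b>world</b>]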
    
    
3. CSV operations
  • """
    @Time:  2021/5/26 16:56
    @Author:    三玖天下第一
    """
    
    import csv
    
    # 1. Create a csv writer
    # 1) csv.writer(file_object)      -   writes rows supplied as lists
    with open(r'./files/test.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
    
    # 1.1) Write data   -   each row is provided as a list
        writer.writerow(['name', 'gender', 'age', 'score'])
        writer.writerows([['张三', '男', 27, 98], ['小酒', '女', 19, 97], ['媛媛', '女', 31, 89]])
    
    # 2) csv.DictWriter(file_object, fieldnames)  -   writes rows supplied as dicts
    with open(r'./files/test2.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['name', 'gender', 'age', 'score'])
    
        # Header row
        writer.writerow({'name': '姓名', 'age': '年龄', 'gender': '性别', 'score': '分数'})
    
        # Write a single row
        writer.writerow({'name': '张三', 'age': '16', 'gender': '男', 'score': '99'})
    
        writer.writerows([{'name': '小九', 'age': '26', 'gender': '女', 'score': '91'}, {'name': '王秀', 'age': '11', 'gender': '女', 'score': '69'}, {'name': '玖就', 'age': '26', 'gender': '女', 'score': '96'}])
    
    
    # 2. Read the csv file contents
    with open(r'./files/test.csv', 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        print(type(reader))
        for x in reader:
            print(x)
    
    with open(r'./files/test2.csv', 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        print(type(reader))
        for x in reader:
            print(x)
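
For DictWriter, the header row does not have to be spelled out by hand as above; writeheader() writes the fieldnames as the first row. A small sketch (the ./files/test3.csv path is only an example):

    import csv

    with open(r'./files/test3.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['name', 'gender', 'age', 'score'])
        writer.writeheader()    # writes: name,gender,age,score
        writer.writerow({'name': '张三', 'gender': '男', 'age': 27, 'score': 98})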
    

Homework

"""
@Time:  2021/5/26 15:19
@Author:    三玖天下第一
"""

from bs4 import BeautifulSoup
import requests
import re
import threading
import json
import csv

class MyThread(threading.Thread):
    def __init__(self, func, *args, **kwargs):
        super().__init__()
        self.func = func
        self.args = args
        self.kwargs = kwargs

    def run(self) -> None:
        self.func(*self.args, **self.kwargs)


# https://cd.fang.ke.com/loupan/pg2/
url = 'https://cd.fang.ke.com/loupan/pg1/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'cookie': 'SUID=F95AB0753020910A000000005FD5894B; usid=qjXeL6vuvIXNBtS7; SUV=00F67CA675B05AF95FD5894BED57B622; CXID=16DC7D64159AF44190C9DBD6A8CBA2D0; pgv_pvi=2036929536; ssuid=5961950505; IPLOC=CN5101; cd=1620360721&10fe495dec5ffaff30a56d7ba3d3be9a; ABTEST=2|1621595807|v17; browerV=13; osV=1; QIDIANID=ECceFpDPX0vuNitkdt6uyN+xxylU2fkEzooTtoYgtZy1o4aijjVG9IiR2k8m9+/g; sw_uuid=1070034123; SNUID=625271A4CACC09D4D9F4A042CA9F1A67; sst0=35; taspeed=taspeedexist; ld=pyllllllll2KCA3kYJvM9QjHDiikOacSNbxzMylllx9lllllpklll5@@@@@@@@@@',
}


def get_html(_url, _headers=None):
    response = None
    try:
        response = requests.get(_url, headers=_headers)
        if response.status_code == 200:
            return response.text
        print(response)
    finally:
        if response is not None:
            response.close()


def analysis(html_text: str, house_list=None):
    if house_list is None:
        house_list = []
    bs = BeautifulSoup(html_text, 'lxml')
    re_obj = re.compile(r'(?s)\s+')
    lis = bs.select('body > div.resblock-list-container.clearfix > ul.resblock-list-wrapper > li')
    for li in lis:
        img = li.select_one('a > img')
        title = li.select_one('div > div.resblock-name')
        position = li.select_one('div > a.resblock-location')
        # counselor = li.select_one('div > div.resblock-agent')
        house_type = li.select_one('div > a.resblock-room')
        feature = li.select_one('div > div.resblock-tag')
        price = li.select_one('div > div.resblock-price')
        house = {
            'name': title.get_text().replace('\n', '').strip(),
            'img': img.attrs['data-original'],
            'position': position.get_text().replace('\n', '').strip(),
            'house_type': house_type.get_text().replace('\n', '').strip(),
            'feature': feature.get_text().replace('\n', '').strip(),
            'price': price.get_text().replace('\n', '').strip()
        }
        house_list.append(house)
        # print(img.attrs['data-original'], img.attrs['alt'])
        # print('='*40)
        # print(re_obj.sub(' ', title.get_text()))
        # print('=' * 40)
        # print(re_obj.sub(' ', position.get_text()))
        # print('=' * 40)
        # print(re_obj.sub(' ', house_type.get_text()))
        # print('=' * 40)
        # # print(counselor)
        # # print('=' * 40)
        # print(re_obj.sub(' ', feature.get_text()))
        # print('=' * 40)
        # print(re_obj.sub(' ', price.get_text()))


if __name__ == '__main__':
    allinfo = []
    # for x in range(1, 101):
    #     url = f'https://cd.fang.ke.com/loupan/pg{x}/'
    #     # new_thread = MyThread(analysis, get_html(url, headers), allinfo)
    #     # new_thread.start()
    #     # new_thread.join()
    #     html_text = get_html(url, headers)
    #     analysis(html_text, allinfo)
    # with open(r'./house.json', 'wb') as f:
    #     f.write(json.dumps(allinfo).encode())

    with open(r'./house.json', 'r', encoding='utf-8') as f:
        allinfo = json.loads(f.read())

        with open(r'./files/houseinfo.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, ['name', 'img', 'position', 'house_type', 'feature', 'price'])
            writer.writerow({
                'name': 'name',
                'img': 'img',
                'position': 'position',
                'house_type': 'house_type',
                'feature': 'feature',
                'price': 'price'
            })
            writer.writerows(allinfo)
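
The commented-out loop above fetches the 100 list pages one after another. A rough sketch of how the MyThread class could fetch pages concurrently instead, reusing get_html and analysis from this script; the page range, one-thread-per-page layout, and variable names here are illustrative assumptions, not part of the original homework:

# Sketch: one thread per page, with a lock protecting the shared result list.
# Assumes the get_html / analysis / MyThread definitions above.
lock = threading.Lock()
shared_info = []

def crawl_page(page):
    html_text = get_html(f'https://cd.fang.ke.com/loupan/pg{page}/', headers)
    if not html_text:
        return
    page_houses = []
    analysis(html_text, page_houses)
    with lock:
        shared_info.extend(page_houses)

threads = [MyThread(crawl_page, page) for page in range(1, 11)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(len(shared_info))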