day22 - Web Scraping 2

Summary

1. Proxies
  • """
    @Time:  2021/5/26 9:37
    @Author:    三玖天下第一
    """
    
    # api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=2'
    
    import requests
    import time
    
    # http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3
    
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    
    
    def get_proxy_ips(api=api):
        response = requests.get(api)
        if response.status_code == 200:
            if response.text[0] == '{':
                print('Fetching too frequently; please follow the required rate limit')
                time.sleep(2)
            else:
                return response.text.split('\n')[:-1]
        else:
            print('Failed to get proxy IPs!')
            time.sleep(3)
            return get_proxy_ips(api)
    
    
    def get_html(url, headers=None, proxies=None, proxies_gen=None):
        response = None
        try:
            response = requests.get(url, headers=headers, proxies=proxies)
            if response.status_code == 200:
                return response.text
            # Non-200 response: if the current proxies have not been replaced yet,
            # fetch a fresh set and retry with it.
            if proxies_gen and not proxies_gen.is_update(proxies):
                new_proxies = proxies_gen.update_proxies()
                return get_html(url, headers=headers, proxies=new_proxies, proxies_gen=proxies_gen)
        except (TimeoutError, requests.RequestException) as e:
            print('Request timed out...', e)
            if proxies_gen and not proxies_gen.is_update(proxies):
                new_proxies = proxies_gen.update_proxies()
                return get_html(url, headers=headers, proxies=new_proxies, proxies_gen=proxies_gen)
        finally:
            if response is not None:
                response.close()
    
    
    if __name__ == '__main__':
        url = 'https://movie.douban.com/top250'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        }
    
    
        def ip_generator():
            while True:
                ips = get_proxy_ips()
                if not ips:
                    continue
                for ip in ips:
                    yield ip
    
    
        ipgen = ip_generator()
    
    
        def gen_proxies(gen=ipgen):
            ip = next(gen)
            return {
                'http': ip,
                'https': ip,
            }
    
    
        class ProxiesCls:
    
            def __init__(self):
                self.proxies = gen_proxies()
    
            def update_proxies(self):
                self.proxies = gen_proxies()
                return self.proxies
    
            def is_update(self, old):
                if old == self.proxies:
                    return False
                return True
    
        proxies_obj = ProxiesCls()
        html_str = get_html(url, headers=headers, proxies=proxies_obj.proxies, proxies_gen=proxies_obj)
        print(html_str)
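
To confirm that a fetched proxy actually works before crawling with it, a quick request to https://httpbin.org/ip (which simply echoes the caller's IP) can help. A minimal standalone sketch; the 'ip:port' value below is a placeholder for one returned by get_proxy_ips():

    # Standalone proxy sanity check (sketch): httpbin.org/ip echoes the requesting IP,
    # so a working proxy should show its own address instead of yours.
    import requests

    def check_proxy(proxies, timeout=5):
        try:
            response = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=timeout)
            if response.status_code == 200:
                print('Exit IP:', response.json()['origin'])
                return True
        except requests.RequestException as e:
            print('Proxy check failed:', e)
        return False

    if __name__ == '__main__':
        ip = '123.123.123.123:8888'   # placeholder; use a value from get_proxy_ips()
        check_proxy({'http': 'http://' + ip, 'https': 'http://' + ip})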
    
    
2. Using bs4
  • """
    @Time:  2021/5/26 11:29
    @Author:    三玖天下第一
    """
    
    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'cookie': 'MUID=00CF3FB6A11967AD2CAC2FF5A0376609; MUIDB=00CF3FB6A11967AD2CAC2FF5A0376609; _EDGE_S=F=1&SID=0A4B569CE0856D7B2E9146DFE1AB6C6E; _EDGE_V=1; SRCHD=AF=QBRE; SRCHUID=V=2&GUID=BE1781471D3C485EB7054DFDCEF154F7&dmnchg=1; _SS=SID=0A4B569CE0856D7B2E9146DFE1AB6C6E; MUIDV=NU=1; SRCHUSR=DOB=20210526&T=1622000102000&TPC=1622000103000; ipv6=hit=1622003704776&t=4; SRCHHPGUSR=SRCHLANGV2=zh-Hans&BZA=0&BRW=XW&BRH=S&CW=1879&CH=469&DPR=1&UTC=480&DM=3&HV=1622000103&WTS=63757596902',
        'origin': 'https://cn.bing.com'
    }
    
    
    def get_net_data(url):
        response = None
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
        finally:
            if response is not None:
                response.close()
    
    
    def analysis_data(data: str, selector=''):
        # 1. Create the parser
        bs = BeautifulSoup(data, 'lxml')
    
        # 2. Select tags with a CSS selector
        selector = selector or 'div#b_content'
        result = bs.select(selector)
        print(len(result))      # 1
        print(type(result))     # <class 'bs4.element.ResultSet'>
        print(type(result[0]))  # <class 'bs4.element.Tag'>
        print(result)
    
        # 3. Get tag content
        # 1) tag.string     -   gets the tag's text (if the tag contains several child tags, or both text and child tags, the result is None)
        selector = 'div#b_content h1'
        div = bs.select_one(selector)
        print(type(div))        # <class 'bs4.element.Tag'>
        print('div.string:', div.string)    # None
        # 2) tag.get_text() -   gets the tag's text content (text inside child tags is included)
        print('div.text:', div.get_text())  # div.text: 没有与此相关的结果: c语言如何查看argv长度
        # 3) tag.contents   -   gets the tag's text and child tags as a list (text and child tags as separate items)
        print('div.contents', type(div.contents), div.contents)     # div.contents <class 'list'> ['没有与此相关的结果: ', <strong>c语言如何查看argv长度</strong>]
    
        # 4. Get tag attributes
        img = bs.select_one('img')
        # 1) tag.attrs[attribute_name]
        print(img)
        print(img.attrs['src'])
    
        # 5. Find specific child tags inside a given tag
        # 1) tag.select(selector)
        # tag.select(css_selector)      -   gets all tags matched by the selector inside the given tag
        # tag.select_one(css_selector)  -   gets the first tag matched by the selector inside the given tag
        div2 = div.select('span')
        print(div2)
    
    if __name__ == '__main__':
        # https://cn.bing.com/search?q=c%E8%AF%AD%E8%A8%80%E5%A6%82%E4%BD%95%E6%9F%A5%E7%9C%8Bargv%E9%95%BF%E5%BA%A6&qs=n&form=QBRE&sp=-1&pq=c%E8%AF%AD%E8%A8%80%E5%A6%82%E4%BD%95%E6%9F%A5%E7%9C%8Bargv%E9%95%BF%E5%BA%A6&sc=0-13&sk=&cvid=760A7C0047094D159C8B1F6DE7426C96
        url = 'https://cn.bing.com/search?q=c%E8%AF%AD%E8%A8%80%E5%A6%82%E4%BD%95%E6%9F%A5%E7%9C%8Bargv%E9%95%BF%E5%BA%A6&qs=n&form=QBRE&sp=-1&pq=c%E8%AF%AD%E8%A8%80%E5%A6%82%E4%BD%95%E6%9F%A5%E7%9C%8Bargv%E9%95%BF%E5%BA%A6&sc=0-13&sk=&cvid=760A7C0047094D159C8B1F6DE7426C96'
        html_str = get_net_data(url)
        analysis_data(html_str)
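
The Bing page above depends on live cookies and may change, so the same tag APIs are easier to see on a small hand-written HTML string. A self-contained sketch, no network request needed:

    # Illustration of .string / .get_text() / .contents / .attrs on an inline snippet.
    from bs4 import BeautifulSoup

    html = '<div id="box"><p>Hello <b>world</b></p><img src="a.png" alt="demo"></div>'
    soup = BeautifulSoup(html, 'lxml')

    p = soup.select_one('div#box p')
    print(p.string)       # None - the tag holds both text and a child tag
    print(p.get_text())   # Hello world - child-tag text is included
    print(p.contents)     # ['Hello ', <b>world</b>] - text and children as separate list items

    img = soup.select_one('img')
    print(img.attrs['src'])                          # a.png
    print(soup.select_one('div#box').select('b'))    # select() also works on a tag: [<b>world</b>]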
    
    
3. CSV operations
  • """
    @Time:  2021/5/26 16:56
    @Author:    三玖天下第一
    """
    
    import csv
    
    # 1. Create a csv writer
    # 1) csv.writer(file_object)      -   writes rows supplied as lists
    with open(r'./files/test.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
    
    # 1.1) Write data   -   each row is provided as a list
        writer.writerow(['name', 'gender', 'age', 'score'])
        writer.writerows([['张三', '男', 27, 98], ['小酒', '女', 19, 97], ['媛媛', '女', 31, 89]])
    
    # 2) csv.DictWriter(file_object, fieldnames)  -   writes rows supplied as dicts
    with open(r'./files/test2.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['name', 'gender', 'age', 'score'])
    
        # Header row
        writer.writerow({'name': '姓名', 'age': '年龄', 'gender': '性别', 'score': '分数'})
    
        # Write a single row
        writer.writerow({'name': '张三', 'age': '16', 'gender': '男', 'score': '99'})
    
        writer.writerows([{'name': '小九', 'age': '26', 'gender': '女', 'score': '91'}, {'name': '王秀', 'age': '11', 'gender': '女', 'score': '69'}, {'name': '玖就', 'age': '26', 'gender': '女', 'score': '96'}])
    
    
    # 2. Read the csv file contents
    with open(r'./files/test.csv', 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        print(type(reader))
        for x in reader:
            print(x)
    
    with open(r'./files/test2.csv', 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        print(type(reader))
        for x in reader:
            print(x)
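
For DictWriter, the header row does not have to be spelled out by hand as above; writeheader() writes the fieldnames as the first row. A small sketch (the ./files/test3.csv path is only an example):

    import csv

    with open(r'./files/test3.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['name', 'gender', 'age', 'score'])
        writer.writeheader()    # writes: name,gender,age,score
        writer.writerow({'name': '张三', 'gender': '男', 'age': 27, 'score': 98})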
    

Homework

"""
@Time:  2021/5/26 15:19
@Author:    三玖天下第一
"""

from bs4 import BeautifulSoup
import requests
import re
import threading
import json
import csv

class MyThread(threading.Thread):
    def __init__(self, func, *args, **kwargs):
        super().__init__()
        self.func = func
        self.args = args
        self.kwargs = kwargs

    def run(self) -> None:
        self.func(*self.args, **self.kwargs)


# https://cd.fang.ke.com/loupan/pg2/
url = 'https://cd.fang.ke.com/loupan/pg1/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'cookie': 'SUID=F95AB0753020910A000000005FD5894B; usid=qjXeL6vuvIXNBtS7; SUV=00F67CA675B05AF95FD5894BED57B622; CXID=16DC7D64159AF44190C9DBD6A8CBA2D0; pgv_pvi=2036929536; ssuid=5961950505; IPLOC=CN5101; cd=1620360721&10fe495dec5ffaff30a56d7ba3d3be9a; ABTEST=2|1621595807|v17; browerV=13; osV=1; QIDIANID=ECceFpDPX0vuNitkdt6uyN+xxylU2fkEzooTtoYgtZy1o4aijjVG9IiR2k8m9+/g; sw_uuid=1070034123; SNUID=625271A4CACC09D4D9F4A042CA9F1A67; sst0=35; taspeed=taspeedexist; ld=pyllllllll2KCA3kYJvM9QjHDiikOacSNbxzMylllx9lllllpklll5@@@@@@@@@@',
}


def get_html(_url, _headers=None):
    response = None
    try:
        response = requests.get(_url, headers=_headers)
        if response.status_code == 200:
            return response.text
        print(response)
    finally:
        if response is not None:
            response.close()


def analysis(html_text: str, house_list=None):
    if house_list is None:
        house_list = []
    bs = BeautifulSoup(html_text, 'lxml')
    re_obj = re.compile(r'(?s)\s+')
    lis = bs.select('body > div.resblock-list-container.clearfix > ul.resblock-list-wrapper > li')
    for li in lis:
        img = li.select_one('a > img')
        title = li.select_one('div > div.resblock-name')
        position = li.select_one('div > a.resblock-location')
        # counselor = li.select_one('div > div.resblock-agent')
        house_type = li.select_one('div > a.resblock-room')
        feature = li.select_one('div > div.resblock-tag')
        price = li.select_one('div > div.resblock-price')
        house = {
            'name': title.get_text().replace('\n', '').strip(),
            'img': img.attrs['data-original'],
            'position': position.get_text().replace('\n', '').strip(),
            'house_type': house_type.get_text().replace('\n', '').strip(),
            'feature': feature.get_text().replace('\n', '').strip(),
            'price': price.get_text().replace('\n', '').strip()
        }
        house_list.append(house)
        # print(img.attrs['data-original'], img.attrs['alt'])
        # print('='*40)
        # print(re_obj.sub(' ', title.get_text()))
        # print('=' * 40)
        # print(re_obj.sub(' ', position.get_text()))
        # print('=' * 40)
        # print(re_obj.sub(' ', house_type.get_text()))
        # print('=' * 40)
        # # print(counselor)
        # # print('=' * 40)
        # print(re_obj.sub(' ', feature.get_text()))
        # print('=' * 40)
        # print(re_obj.sub(' ', price.get_text()))


if __name__ == '__main__':
    allinfo = []
    # for x in range(1, 101):
    #     url = f'https://cd.fang.ke.com/loupan/pg{x}/'
    #     # new_thread = MyThread(analysis, get_html(url, headers), allinfo)
    #     # new_thread.start()
    #     # new_thread.join()
    #     html_text = get_html(url, headers)
    #     analysis(html_text, allinfo)
    # with open(r'./house.json', 'wb') as f:
    #     f.write(json.dumps(allinfo).encode())

    with open(r'./house.json', 'r', encoding='utf-8') as f:
        allinfo = json.loads(f.read())

        with open(r'./files/houseinfo.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, ['name', 'img', 'position', 'house_type', 'feature', 'price'])
            writer.writerow({
                'name': 'name',
                'img': 'img',
                'position': 'position',
                'house_type': 'house_type',
                'feature': 'feature',
                'price': 'price'
            })
            writer.writerows(allinfo)
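
The commented-out loop above fetches the 100 list pages one after another. A rough sketch of how the MyThread class could fetch pages concurrently instead, reusing get_html and analysis from this script; the page range, one-thread-per-page layout, and variable names here are illustrative assumptions, not part of the original homework:

# Sketch: one thread per page, with a lock protecting the shared result list.
# Assumes the get_html / analysis / MyThread definitions above.
lock = threading.Lock()
shared_info = []

def crawl_page(page):
    html_text = get_html(f'https://cd.fang.ke.com/loupan/pg{page}/', headers)
    if not html_text:
        return
    page_houses = []
    analysis(html_text, page_houses)
    with lock:
        shared_info.extend(page_houses)

threads = [MyThread(crawl_page, page) for page in range(1, 11)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(len(shared_info))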