python之爬虫的入门05------实战：爬取贝壳网（用re匹配需要的数据）

本文链接：https://blog.csdn.net/sui_yi123/article/details/83511822

本文介绍使用Python进行网络爬虫的实战案例，通过正则表达式（re）解析贝壳网上的数据，详细讲解如何抓取并处理所需信息。

摘要由CSDN通过智能技术生成

# 第二页：https://hz.zu.ke.com/zufang/pg2
# 第一页：https://hz.zu.ke.com/zufang/pg1


import urllib.request
import random
import re

def user_ip():
    """Install a global urllib opener that uses a randomly chosen proxy.

    One ``host:port`` entry is picked at random from a hard-coded pool and
    registered for the ``http`` scheme only (no ``https`` entry is set).
    The opener built from it replaces urllib's default opener, so later
    ``urllib.request.urlopen`` calls go through it. Returns ``None``.
    """
    proxy_pool = ['117.191.11.109:8542', '186.46.192.110:8177', '39.137.2.214:8882']
    chosen_proxy = random.choice(proxy_pool)
    # ProxyHandler expects a {scheme: 'host:port'} mapping.
    handler = urllib.request.ProxyHandler({'http': chosen_proxy})
    urllib.request.install_opener(urllib.request.build_opener(handler))

def create_request(url, headers):
    """Build a ``urllib.request.Request`` for *url* carrying *headers*.

    The request object is only constructed here, not sent.
    """
    return urllib.request.Request(url, headers=headers)

def get_response(req):
    """Open *req* with ``urllib.request.urlopen`` and return the response object."""
    return urllib.request.urlopen(req)

def get_html(response):
    """Read everything from *response* and return it decoded as UTF-8 text."""
    raw_bytes = response.read()
    return raw_bytes.decode('utf-8')

def get_home_img(html):
    """Extract the ``.jpg`` image URLs stored in ``data-src`` attributes.

    Args:
        html: Page source as text.

    Returns:
        A list with every ``data-src="..."`` value that ends in ``.jpg``,
        in the order the attributes appear in *html*. Empty if none match.
    """
    # ``[^"\n]*`` keeps the capture inside a single attribute value. A lazy
    # ``.*?`` would start at a non-.jpg data-src and run on to the next
    # ``.jpg"`` on the same line, yielding a merged, bogus "URL".
    return re.findall(r'data-src="([^"\n]*\.jpg)"', html)

def get_home_name(html):
    """Return the listing titles found in *html*.

    The pattern is matched literally, including the line breaks and the
    indentation inside it, so it only matches title markup laid out exactly
    that way. The captured group is the text between the opening ``<a ...>``
    line and the closing ``</a>``.
    """
    title_pattern = re.compile(r'''<p class="content__list--item--title twoline">
                  <a target="_blank" href=".*">
                    (.*?)                  </a>
                </p>''')
    return title_pattern.findall(html)

def chu_kongji(list4):
    """Return a copy of *list4* with empty strings removed from each group.

    *list4* is iterated as a sequence of groups; each group becomes a new
    list holding only its elements that are not equal to ``''``. A group
    with nothing left is kept as an empty list.
    """
    return [[entry for entry in group if entry != ''] for group in list4]

def get_home_details(html):
    # 3.详细信息
    home_details = re.findall(r'''<p class="content__list--item--des">