# -*- coding: utf-8 -*-
"""
Created on Fri Jun 14 17:37:44 2019
@author: User
"""
import os
import re
from bs4 import BeautifulSoup  # the beautifulsoup4 package is imported under the name bs4
import requests
import numpy as np
# a desktop-browser User-Agent so the site serves the normal HTML page
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}
i_pg = 0  # global page counter; also drives the next-page offset
# Structured dtype for one transaction record (field names are pinyin):
# bianhao = parcel number, quyu = district, riqi = date,
# nian/yue/ri = year/month/day, zhuangtai = status,
# mianji = area, jiage = price, yongtu = land use
area_dtype = np.dtype([('bianhao', np.str_, 50),
                       ('quyu', np.str_, 30),
                       ('riqi', np.str_, 30),
                       ('nian', np.str_, 30),
                       ('yue', np.str_, 30),
                       ('ri', np.str_, 30),
                       ('zhuangtai', np.str_, 30),
                       ('mianji', np.str_, 30),
                       ('jiage', np.str_, 30),
                       ('yongtu', np.str_, 30)])
# accumulator for all scraped rows; the first record acts as a CSV header row
array_area_all = np.array([('title', 'quyu',
                            'riqi', 'nian',
                            'yue', 'ri',
                            'zhuangtai', 'mianji',
                            'jiage', 'yongtu')], dtype=area_dtype)
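# A minimal sketch of how records of this dtype behave (illustrative values only):
#   rec = np.array([('001', 'XiHu', '2019-06-14', '2019', '06', '14',
#                    'sold', '1000', '500', 'residential')], dtype=area_dtype)
#   rec['quyu']      -> array(['XiHu'], dtype='<U30')
#   rec[0]['jiage']  -> '500'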
def get_page(url):
    global i_pg
    i_pg += 1
    print('page:', i_pg)
    # print(url)
    # Hard stop after 50 pages so the recursion below terminates; returning
    # (rather than exiting) lets the top-level code still print and save
    # whatever has been collected.
    if i_pg > 50:
        return ''
    try:
        response = requests.get(url, timeout=30, headers=header)
        # raise HTTPError if the status code is not 200
        response.raise_for_status()
        # let requests detect the page encoding from the content
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        result_li = soup.find_all(class_=re.compile("rich-table-row.*?"))
        # print('result_li:', result_li)
        j = 0
        # process each listing row on the current page
        for row_text in result_li:
            # j counts rows; an earlier debugging version used it to handle
            # only the first few links per page (the limit is no longer applied)
            j = j + 1
            # parcel number
            title_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id92")})
            global title
            title = title_str.text.lstrip()
            # print('parcel number:', title)
            # district
            quyu_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id98")})
            global quyu
            quyu = quyu_str.text.lstrip()
            # print('district:', quyu)
            # transaction price; strip the '万元' (10,000-yuan) unit
            chengjiaojia_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id107")})
            global chengjiaojia
            chengjiaojia = chengjiaojia_str.text.lstrip().replace('万元', '', 1)
            chengjiaojia = my_strip(chengjiaojia)
            if len(chengjiaojia) == 0:
                chengjiaojia = '0'
            # print('price:', chengjiaojia)
            # transaction date
            chengjiao_date_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id116")})
            global chengjiao_date
            chengjiao_date = chengjiao_date_str.text.lstrip()
            # split an ISO-style date such as '2019-06-14' into year/month/day
            global nian
            nian = chengjiao_date[0:4]
            # print('year:', nian)
            global yue
            yue = chengjiao_date[5:7]
            # print('month:', yue)
            global ri
            ri = chengjiao_date[8:10]
            # print('day:', ri)
            # print('transaction date:', chengjiao_date)
            # transaction status
            chengjiao_state_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id119")})
            global chengjiao_state
            chengjiao_state = chengjiao_state_str.text.lstrip()
            # print('status:', chengjiao_state)
            detail_href = row_text.find('a', {'id': re.compile(r"j_id46:\d+:j_id124")})
            detail_url = 'http://****' + detail_href.attrs['href']
            get_page_detail(detail_url)
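            # get_page_detail fills the globals `area` and `use_1` used below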
            # build a one-record array for this row
            global area_new
            area_new = np.array([(title, quyu,
                                  chengjiao_date, nian,
                                  yue, ri,
                                  chengjiao_state,
                                  area, chengjiaojia,
                                  use_1)], dtype=area_dtype)
            # print('area_new:', area_new)
            # append to the accumulator; np.concatenate keeps the result a 1-D
            # structured array (np.vstack would produce a 2-D (N, 1) array that
            # np.savetxt cannot format field by field)
            global array_area_all
            array_area_all = np.concatenate((array_area_all, area_new))
        # crawl the next page; firstResult is a zero-based row offset at
        # 20 rows per page, so the page after page i_pg starts at i_pg * 20
        result_next_page = 'http://****?firstResult=' + str(i_pg * 20) + '&priceUnit=TotalPrice&logic=and'
        # recurse; the constructed URL string is never empty, so the crawl
        # only stops at the i_pg > 50 guard above
        get_page(result_next_page)
        return response.text
    except Exception as e:
        # Exception (not a bare except) so the error message is preserved
        return 'exception occurred: %s' % e
# remove spaces, newlines and tabs anywhere in a string, then strip the ends
def my_strip(s):
    return str(s).replace(" ", "").replace("\n", "").replace("\t", "").strip()
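# For example:
#   my_strip('  1 234\n平方米\t') -> '1234平方米'
#   my_strip(None) -> 'None' (the str() call makes non-string input safe)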
# BeautifulSoup is constructed repeatedly, so wrap it in a small helper
def my_Beautifulsoup(response):
    return BeautifulSoup(str(response), 'html.parser')
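# Usage sketch (the helper is defined but not currently called in this script):
#   cell_soup = my_Beautifulsoup(row_text)  # re-parse a tag's HTML
#   cell_soup.find_all('span')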
# scrape one listing's detail page for the parcel area and land use
def get_page_detail(url):
    response = requests.get(url, headers=header)
    if response.status_code == 200:
        # detect the encoding as in get_page, so the unit text parses correctly
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # parcel area: the second span.layout holds the value;
        # strip the '平方米' (square-metre) unit
        area_all = soup.find(attrs={'id': 'j_id242'})
        area_2 = area_all.find_all('span', {'class': 'layout'})[1]
        global area
        area = area_2.text.lstrip().replace('平方米', '', 1)
        area = my_strip(area)
        if len(area) == 0:
            area = '0'
        # print('area: ' + area)
        # land use
        use_all = soup.find(attrs={'id': 'j_id267'})
        use_2 = use_all.find_all('span', {'class': 'layout'})[1]
        global use_1
        use_1 = use_2.text.lstrip()
        use_1 = my_strip(use_1)
        # print('use: ' + use_1)
# =============================================================================
# Entry point: crawl from the first results page, then dump everything to CSV.
get_page('http://****')
print(array_area_all)
# make sure the output directory exists before writing
os.makedirs('data', exist_ok=True)
np.savetxt('data\\zl.csv', array_area_all, delimiter=',', fmt="%s")