Program 7: Web Scraping

读入网站

import requests
from bs4 import BeautifulSoup

# Fetch the real-estate listings page; the User-Agent header makes the
# request look like a normal browser so the site does not reject it.
r = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers={"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'},
)
# Defect fix: name the parser explicitly. BeautifulSoup(r.content) alone
# emits a GuessedAtParserWarning and may pick different parsers (and thus
# produce different trees) on different machines.
soup = BeautifulSoup(r.content, "html.parser")

soup保存网站数据

提取div——class;
提取h4——class;
删除多余的\n和空格;

import requests
from bs4 import BeautifulSoup

# Fetch the listings page with a browser-like User-Agent header.
r = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers={"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'},
)

# Defect fix: pass the parser explicitly to avoid bs4's
# GuessedAtParserWarning and machine-dependent parser selection.
soup = BeautifulSoup(r.content, "html.parser")

# Every <div class="propertyRow"> is one property card (list-like ResultSet).
# NOTE: `all` shadows the builtin; the name is kept because the snippets
# below reference it.
all = soup.find_all("div", {"class": "propertyRow"})

# len(all)  # 10 results per page

# List of <h4 class="propPrice"> inside the first card.
price = all[0].find_all("h4", {"class": "propPrice"})
price  # notebook-style bare expression: displays the list in Jupyter

all[0].find("h4", {"class": "propPrice"}).text  # raw price text (str)
# Same text with the surrounding newlines and spaces stripped.
all[0].find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", '')

Try/except:

得到每个房子的info;
bed、area、bath等;
从span+class中提取;
try——except防止无数据报错;

for i in all:  # iterate over every property card
    # Price, with surrounding newlines/spaces stripped.
    print(i.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", ''))
    print(i.find_all("span", {"class": "propAddressCollapse"})[0].text)  # address
    print(i.find_all("span", {"class": "propAddressCollapse"})[1].text)  # locality
    # Number of beds.
    # Defect fix: catch AttributeError specifically (raised when find()
    # returns None for a missing span/<b>) instead of a bare `except:` that
    # would also swallow KeyboardInterrupt and real bugs.
    try:
        print(i.find("span", {"class": "infoBed"}).find("b").text)
    except AttributeError:
        print(None)

    # Area in square feet.
    try:
        print(i.find("span", {"class": "infoSqFt"}).find("b").text)
    except AttributeError:
        print(None)

    # Full baths.
    try:
        print(i.find("span", {"class": "infoValueFullBath"}).find("b").text)
    except AttributeError:
        print(None)

    # Half baths.
    try:
        print(i.find("span", {"class": "infoValueHalfBath"}).find("b").text)
    except AttributeError:
        print(None)
    print(" ")
    # print(i.find_all("span", {"class": "infoBed"}))

添加代码,zip遍历两个list,得到feature信息
若增加条件判断,可得到特定条件的feature信息;

# Extract "feature" info (continuation of the loop body over the cards in `all`).
    for column_group in i.find_all("div",{"class":"columnGroup"}):
        #print(column_group)
        # keep iterating into the inner spans
        for feature_group, feature_name in zip(column_group.find_all("span",{"class":"featureGroup"}),column_group.find_all("span",{"class":"featureName"})):      # zip walks the two lists in lockstep
            #print(feature_group.text,feature_name.text)
            if "Lot Size" in feature_group.text:  # if the group label contains "Lot Size", print the matching value
                print(feature_name.text)

将数据放入DataFrame中

建立数组,存放字典;
字典为 feature:value

# Collect one dictionary per property card; the dictionaries are stored in
# the list `l` (later turned into a DataFrame).
def _find_text(row, tag, css_class):
    """Return the <b> text inside the first `tag` with class `css_class`
    in `row`, or None when the tag (or its <b>) is missing."""
    try:
        return row.find(tag, {"class": css_class}).find("b").text
    except AttributeError:  # find() returned None somewhere in the chain
        return None


l = []
for i in all:  # iterate over every property card
    d = {}

    d["Address"] = i.find_all("span", {"class": "propAddressCollapse"})[0].text
    d["Locality"] = i.find_all("span", {"class": "propAddressCollapse"})[1].text
    d["Price"] = i.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", '')

    # Defect fix: the four copy-pasted try/bare-except blocks are replaced by
    # one helper that catches only AttributeError, so real bugs are no longer
    # silently swallowed.
    d["Beds"] = _find_text(i, "span", "infoBed")
    d["Area"] = _find_text(i, "span", "infoSqFt")
    d["Full Baths"] = _find_text(i, "span", "infoValueFullBath")
    d["Half Baths"] = _find_text(i, "span", "infoValueHalfBath")

    # Feature info: pair each featureGroup label with its featureName value.
    for column_group in i.find_all("div", {"class": "columnGroup"}):
        for feature_group, feature_name in zip(
            column_group.find_all("span", {"class": "featureGroup"}),
            column_group.find_all("span", {"class": "featureName"}),
        ):
            if "Lot Size" in feature_group.text:  # keep only the lot-size feature
                d["Lot Size"] = feature_name.text

    l.append(d)
    # print(i.find_all("span", {"class": "infoBed"}))

将字典转为DataFrame:
to_csv保存为csv格式的文件;

import pandas

# Tabulate the scraped records and persist them as a CSV file
# (the file can then be opened with Excel).
df = pandas.DataFrame.from_records(l)
df.to_csv("out_put.csv")

进行翻页抓取

requests.get得到网页html代码;
soup将代码格式化;
find_all得到代码中的指定标签(类似list类型);
page_nr为页码数;

import requests
from bs4 import BeautifulSoup

# Fetch the first results page with a browser-like User-Agent header.
r = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers={"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'},
)
# Defect fix: pass the parser explicitly (avoids GuessedAtParserWarning and
# machine-dependent parser selection).
soup = BeautifulSoup(r.content, "html.parser")
all = soup.find_all("div", {"class": "propertyRow"})  # property cards on the page
# The last <a class="Page"> link holds the total number of result pages
# (as text, e.g. "3").
page_nr = soup.find_all("a", {"class": "Page"})[-1].text

翻页抓取:
通过for循环得到每次的url;
每次传入url得到all;
对all进行遍历抓取;

# Scrape every result page; one dictionary per property, collected in `l`.
def _find_text(row, tag, css_class):
    """Return the <b> text inside the first `tag` with class `css_class`
    in `row`, or None when the tag (or its <b>) is missing."""
    try:
        return row.find(tag, {"class": css_class}).find("b").text
    except AttributeError:  # find() returned None somewhere in the chain
        return None


l = []
base_url = "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
for page in range(0, int(page_nr) * 10, 10):  # the site paginates in steps of 10
    print(base_url + str(page) + ".html")
    r = requests.get(
        base_url + str(page) + ".html",
        headers={"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'},
    )
    soup = BeautifulSoup(r.content, "html.parser")
    # print(soup.prettify())
    all = soup.find_all("div", {"class": "propertyRow"})  # property cards on this page

    for i in all:  # one record per card
        d = {}

        d["Address"] = i.find_all("span", {"class": "propAddressCollapse"})[0].text
        # Defect fix: catch IndexError specifically -- the locality span may
        # be absent, in which case find_all()[1] raises IndexError.
        try:
            d["Locality"] = i.find_all("span", {"class": "propAddressCollapse"})[1].text
        except IndexError:
            d["Locality"] = None
        d["Price"] = i.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", '')

        # Defect fix: the copy-pasted try/bare-except blocks are replaced by
        # one helper catching only AttributeError (find() returning None).
        d["Beds"] = _find_text(i, "span", "infoBed")
        d["Area"] = _find_text(i, "span", "infoSqFt")
        d["Full Baths"] = _find_text(i, "span", "infoValueFullBath")
        d["Half Baths"] = _find_text(i, "span", "infoValueHalfBath")

        # Feature info: pair each featureGroup label with its featureName value.
        for column_group in i.find_all("div", {"class": "columnGroup"}):
            for feature_group, feature_name in zip(
                column_group.find_all("span", {"class": "featureGroup"}),
                column_group.find_all("span", {"class": "featureName"}),
            ):
                if "Lot Size" in feature_group.text:  # keep only the lot-size feature
                    d["Lot Size"] = feature_name.text

        l.append(d)
       

保存至csv文件

# Turn the accumulated list of per-property dictionaries into a table and
# write it out as CSV (readable with Excel).
import pandas

df = pandas.DataFrame.from_records(l)
df.to_csv("out_put.csv")

完整代码

import requests
from bs4 import BeautifulSoup
import pandas

# Browser-like User-Agent so the site does not reject our requests.
HEADERS = {"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'}


def _find_text(row, tag, css_class):
    """Return the <b> text inside the first `tag` with class `css_class`
    in `row`, or None when the tag (or its <b>) is missing."""
    try:
        return row.find(tag, {"class": css_class}).find("b").text
    except AttributeError:  # find() returned None somewhere in the chain
        return None


# Defect fix: the original "complete code" used `page_nr` without ever
# computing it, so it raised NameError when run standalone. Fetch the first
# page and read the page count from the last <a class="Page"> link.
first = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers=HEADERS,
)
page_nr = BeautifulSoup(first.content, "html.parser").find_all("a", {"class": "Page"})[-1].text

# One dictionary per property, collected in `l`.
l = []
base_url = "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
for page in range(0, int(page_nr) * 10, 10):  # the site paginates in steps of 10
    print(base_url + str(page) + ".html")
    r = requests.get(base_url + str(page) + ".html", headers=HEADERS)
    soup = BeautifulSoup(r.content, "html.parser")
    # print(soup.prettify())
    all = soup.find_all("div", {"class": "propertyRow"})  # property cards on this page

    for i in all:  # one record per card
        d = {}

        d["Address"] = i.find_all("span", {"class": "propAddressCollapse"})[0].text
        # Defect fix: catch IndexError specifically -- the locality span may
        # be absent, in which case find_all()[1] raises IndexError.
        try:
            d["Locality"] = i.find_all("span", {"class": "propAddressCollapse"})[1].text
        except IndexError:
            d["Locality"] = None
        d["Price"] = i.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", '')

        # Defect fix: the copy-pasted try/bare-except blocks are replaced by
        # one helper catching only AttributeError (find() returning None).
        d["Beds"] = _find_text(i, "span", "infoBed")
        d["Area"] = _find_text(i, "span", "infoSqFt")
        d["Full Baths"] = _find_text(i, "span", "infoValueFullBath")
        d["Half Baths"] = _find_text(i, "span", "infoValueHalfBath")

        # Feature info: pair each featureGroup label with its featureName value.
        for column_group in i.find_all("div", {"class": "columnGroup"}):
            for feature_group, feature_name in zip(
                column_group.find_all("span", {"class": "featureGroup"}),
                column_group.find_all("span", {"class": "featureName"}),
            ):
                if "Lot Size" in feature_group.text:  # keep only the lot-size feature
                    d["Lot Size"] = feature_name.text

        l.append(d)

# Persist everything as CSV (openable with Excel).
df = pandas.DataFrame(l)
df.to_csv("out_put.csv")
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值