Scraping 58.com (58同城) rental listings with a Python crawler (using rotating/dynamic IPs) and exporting them to an Excel file


I'm a beginner; I spent a few days putting this together for a data-analysis project and finally got it working. Suggestions and feedback are very welcome.

# coding=utf-8
import time

import requests
import xlwt
from bs4 import BeautifulSoup

# Request headers: copy the User-Agent string from your own browser
User_Agent = 'your-agent'
headers = {
    'User-Agent': User_Agent,
}
def download(url):
    # one workbook per run; column headers go in row 0
    excel = xlwt.Workbook(encoding='utf-8')
    sheet = excel.add_sheet('sheet1')
    sheet.write(0, 0, '标题')
    sheet.write(0, 1, '房子')
    sheet.write(0, 2, '大小')
    sheet.write(0, 3, '地址1')
    sheet.write(0, 4, '地址2')
    sheet.write(0, 5, '其他')
    sheet.write(0, 6, '价格')
    sheet.write(0, 7, '地区')
    sheet.write(0, 8, '其他2')
    db_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(db_data.text, 'lxml')
    # listing title links; each one also carries the URL of the detail page
    titles = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > h2 > a:nth-of-type(1)')
    houses = soup.select('body > div.list-wrap > div.list-box > ul > li > div.des > p.room')
    oneaddresss = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor > a:nth-of-type(1)')
    twoaddresss = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor > a:nth-of-type(2)')
    additions = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor')
    prices = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.list-li-right > div.money > b')
    count = 1  # current row in the worksheet
    ssss = 5   # seconds to sleep between detail-page requests
    for title, house, oneaddress, twoaddress, addition, price in zip(titles, houses, oneaddresss, twoaddresss, additions, prices):
        # the indices 20 and 64 depend on the amount of whitespace 58.com puts in
        # the listing text; they may need adjusting if the page layout changes
        sheet.write(count, 0, str(title.string).replace(' ', '').replace('\n', ''))
        sheet.write(count, 1, house.get_text().split(' ')[0].replace(' ', '').replace("\n", ""))
        sheet.write(count, 2, house.get_text().split(' ')[20].replace(' ', '').replace("\n", "").replace("\xa0", ""))
        sheet.write(count, 3, oneaddress.get_text().replace(' ', '').replace("\n", ""))
        sheet.write(count, 4, twoaddress.get_text().replace(' ', '').replace("\n", ""))
        sheet.write(count, 5, addition.get_text().split(' ')[64].replace(' ', '').replace("\n", ""))
        sheet.write(count, 6, price.get_text().replace(' ', '').replace("\n", ""))
        # the title link's href points at the listing's detail page
        url1 = title.get('href')
        # retry with an increasing back-off; if the detail page still cannot be
        # fetched, save what has been collected so far and skip this listing
        db_data1 = None
        for wait in (5, 10, 35):
            time.sleep(wait)
            try:
                db_data1 = requests.get(url1, headers=headers)
                break
            except requests.RequestException:
                continue
        if db_data1 is None:
            excel.save('filename.xls')
            continue
        soup1 = BeautifulSoup(db_data1.text, 'lxml')
        # district name; again the magic index 28 depends on the page's whitespace
        pos = soup1.select('body > div.houseInfo > ul >li')
        try:
            district = pos[2].get_text().split(' ')[28].replace(' ', '').replace("\n", "")
            print(district)
            sheet.write(count, 7, district)
        except IndexError:
            excel.save('filename.xls')
        # facilities / amenities: 58.com uses two different detail-page layouts,
        # so collect the short <li> entries from both possible selectors
        pos2 = soup1.select('body > div.configure > div.fang-detail > dl> dd> ul >li')
        pos3 = soup1.select('body > div.configure > ul >li')
        stra = ''
        for p in pos2 + pos3:
            if len(str(p)) < 25:
                stra = stra + ',' + p.get_text().replace(' ', '').replace("\n", "")
        print(stra)
        sheet.write(count, 8, stra)
        count += 1
        print("已经抓取" + str(count - 1) + "条数据")
        time.sleep(ssss)
    excel.save('filename.xls')




if __name__ == '__main__':
    download("https://cd.58.com/chuzu/pn6/?PGTID=0d3090a7-0006-6d54-6db2-a757b630de63&ClickID=2")


Because of 58.com's anti-scraping measures, a single fixed IP only gets through a few listings before it is blocked, so a rotating/dynamic IP is recommended.
Without further ado, here is what the output looks like. (screenshot of the generated Excel file)
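
With requests it is easy to route each call through a proxy from a pool. Below is a minimal sketch of how the two requests.get calls above could be swapped for a proxy-aware helper; the proxy addresses are hypothetical placeholders for whatever your proxy provider gives you.

import random
import requests

# hypothetical proxy addresses - replace with the ones from your own provider
PROXIES = [
    'http://123.45.67.89:8888',
    'http://98.76.54.32:8080',
]

def get_with_proxy(url, headers):
    # pick a random proxy for every request; fall back to a direct request if it fails
    proxy = random.choice(PROXIES)
    try:
        return requests.get(url, headers=headers,
                            proxies={'http': proxy, 'https': proxy}, timeout=10)
    except requests.RequestException:
        return requests.get(url, headers=headers, timeout=10)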
