简介
对某房源网站进行信息抓取
流程及思路
一是请求网页,获取HTML代码。
二是解析代码获取房源信息,包括地址,每日价格等等。
三是下载这些数据并依次保存在excel里面。
# _*_ coding: utf-8 _*_
import requests #导入网页请求库
import re #导入正则表达式库
from bs4 import BeautifulSoup #导入网页解析库
import xlwt
import xlrd
import parser
import os
from xlutils.copy import copy
def start_requests(url):
    """Fetch *url* and return the response body decoded as text.

    A browser-like User-Agent is sent so the listing site does not
    reject the request as coming from a bot.
    """
    headers = {
        # Fixed: the header name must be 'User-Agent' (no spaces) and the
        # UA value must not contain spaces around '/' — the original
        # 'User - Agent' key was never recognized as a real UA header.
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/63.0.3239.132 Safari/537.36')
    }
    response = requests.get(url, headers=headers)
    # Decode manually rather than using response.text so the page's own
    # declared encoding (UTF-8 here) wins over requests' guess.
    return response.content.decode()
#
def get_info(url):
    """Scrape title / address / nightly price from one listing page and
    append the rows to xian.xls.

    Uses the module-level ``count`` as both the serial number and the
    Excel row index; it is incremented once per listing written.
    """
    global count
    soup = BeautifulSoup(start_requests(url), 'lxml')
    titles = soup.select('div.pho_info>h4>em')
    addresses = soup.select('span.pr5')
    prices = soup.select('#pricePart >div> span')

    # Open/copy the workbook ONCE per page and save ONCE at the end.
    # (The original reopened and resaved the file for every single
    # listing inside the loop, and rewrote the header row each time.)
    r_xls = xlrd.open_workbook('xian.xls')
    excel = copy(r_xls)
    table = excel.get_sheet(0)

    # Header row; column order matches the per-row writes below.
    table.write(0, 0, '序号')
    table.write(0, 1, '房源')
    table.write(0, 2, '地址')
    table.write(0, 3, '链接')
    table.write(0, 4, '价格')

    for title, address, price in zip(titles, addresses, prices):
        row = count
        table.write(row, 0, row)
        table.write(row, 1, title.string.strip())
        table.write(row, 2, address.string.strip())
        table.write(row, 3, url)
        # Price is stored as an int so Excel treats it as a number.
        # NOTE(review): assumes the scraped text is a bare integer —
        # a value like "128起" would raise ValueError; confirm on site.
        table.write(row, 4, int(price.string.strip()))
        count += 1

    excel.save('xian.xls')
#获取所有的住房链接
def get_links(url):
    """Collect every listing link on one search-result page and scrape
    each linked listing via get_info()."""
    page = BeautifulSoup(start_requests(url), 'lxml')
    for anchor in page.select('#page_list>ul>li>a'):
        # Each <a> href points at an individual listing's detail page.
        get_info(anchor['href'])
#主调函数入口
if __name__ == '__main__':
    # Global listing counter shared with get_info(): serial number and
    # Excel row index, starting from row 1 (row 0 is the header).
    count = 1

    # Create a fresh workbook with a single sheet so get_info() has a
    # file to reopen and append to.
    workbook = xlwt.Workbook(encoding='utf-8')
    workbook.add_sheet('西安')
    workbook.save('xian.xls')

    # Crawl the first four search-result pages of Xi'an short-term rentals.
    base = 'http://xa.xiaozhu.com/search-duanzufang-p{}-0/'
    for url in (base.format(number) for number in range(1, 5)):
        print(url)
        get_links(url)
#
代码写得比较粗糙,还有进一步优化的空间。
在Win10的64位系统,Python3.7中已测试过,运行正常。