python3爬虫最全基础知识实例练习，数据列表，抓取图片链接，获取HTML解析，获取下一页的URL和网页内容，数据写入excel（霸霸看了都说好）

最新推荐文章于 2021-10-06 16:00:27 发布

霸霸最棒

最新推荐文章于 2021-10-06 16:00:27 发布

阅读量210

点赞数

分类专栏：python基础　文章标签：python

本文链接：https://blog.csdn.net/pujun1201/article/details/119874197

版权

python基础专栏收录该内容

4 篇文章 0 订阅

订阅专栏

代码展示

本文用到爬虫最基本的操作：发送请求、HTML 解析、抓取图片链接、获取下一页、将数据写入 Excel，基本涵盖了爬取网页的完整流程。

# 请求
import requests
# html解析
from bs4 import BeautifulSoup
# 正则
import re
# 写入Excel文件的扩展工具
import xlwt

# Patterns applied to the HTML text of a single listing card.
# All wildcard captures are non-greedy so a match ends at the FIRST closing
# tag; with greedy `.*`, two tags on the same line were merged into one
# over-long capture.

# Image link: captures the file name between the fixed host prefix and ".jpg"
# (literal dots are escaped so they only match ".")
itemImg = re.compile(
    r'src="https://pic4\.ajkimg\.com/display/58ajk/(.*?)\.jpg"')
# Development (listing) name
itemName = re.compile(r'<span class="items-name">(.*?)</span>')
# Address
itemAddress = re.compile(
    r'<span class="list-map" target="_blank">(.*?)</span>')
# Price
itemPrice = re.compile(r'<p class="price">(.*?)</p>')
# Surrounding-area price; re.S lets the capture span line breaks
itemRim = re.compile(r'<p class="favor-tag around-price">(.*?)</p>', re.S)
# Floor area
itemArea = re.compile(r'<span class="building-area">(.*?)</span>')
# Sale status: group 1 is the CSS status class, group 2 is the visible label
itemSale = re.compile(
    r'<i class="status-icon (soldout|onsale|forsale)">(.*?)</i>')
# Property type
itemType = re.compile(r'<i class="status-icon wuyetp">(.*?)</i>')
# Feature tags (may match several times per card)
itemContext = re.compile(r'<span class="tag">(.*?)</span>')
# Room counts (may match several times per card)
itemHouseType = re.compile(r'<span>(\d*)室</span>', re.S)


# Write the collected rows to an xls spreadsheet
def saveData(list, title, savepath):
    """Create a workbook with one sheet, fill it, and save it to ``savepath``.

    Row 0 receives the column headings from ``title``; record ``k`` of
    ``list`` is written to row ``k + 1``. Only the first ``len(title)``
    fields of each record are written.

    Args:
        list: Sequence of records, each indexable by column position.
        title: Sequence of column headings.
        savepath: Destination passed to ``Workbook.save``.
    """
    print('save...')
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('58同城琼海新房', cell_overwrite_ok=True)
    columnCount = len(title)

    # Header row
    for col, heading in enumerate(title):
        worksheet.write(0, col, heading)

    # Data rows start directly below the header
    for idx, record in enumerate(list):
        print('第%d条' % idx)
        for col in range(columnCount):
            worksheet.write(idx + 1, col, record[col])

    workbook.save(savepath)


# Fetch one page of the listing index
def baseUrl(index, timeout=10):
    """Download listing page ``index`` and return it as a parsed document.

    Args:
        index: Page number; it is converted with ``str`` and inserted into
            the URL path as ``.../all/p<index>/``.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Without a timeout a stalled server would block the
            scrape forever.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: Whatever ``requests.get``
            raises (connection failure, timeout, ...) is propagated.
    """
    # Browser-like User-Agent so the server treats this as a browser request
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/51.0.2704.63 Safari/537.36'
    }
    # Page URL
    url = 'https://qh.58.com/xinfang/loupan/all/p' + str(
        index) + '/?PGTID=0d0091a8-0279-8d26-f840-a5e5990146b3&ClickID=1'
    print(url)
    res = requests.get(url, headers=headers, timeout=timeout)
    # Decode the body as UTF-8 regardless of what the response headers say
    res.encoding = "utf-8"
    return BeautifulSoup(res.text, "html.parser")


# Return capture group 1 of the first match of `pattern`, or `default`
def _firstGroup(pattern, text, default=''):
    match = pattern.search(text)
    return match.group(1) if match else default


# Convert the HTML text of one listing card into a spreadsheet row
def _parseItem(item):
    """Extract the nine output columns from one listing card.

    Fields whose pattern does not match become ``''`` so that one
    incomplete card cannot abort the whole scrape with an IndexError.

    Args:
        item: HTML source of the card as a string.

    Returns:
        ``[image url, name, address, price, area, status, type, tags,
        rooms]`` - all strings.
    """
    # Image: rebuild the full URL from the captured file name
    imgName = _firstGroup(itemImg, item, None)
    if imgName is None:
        img = ''
    else:
        img = 'https://pic4.ajkimg.com/display/58ajk/' + imgName + '.jpg'

    name = _firstGroup(itemName, item)
    address = _firstGroup(itemAddress, item)

    # Price: fall back to the surrounding-area price when no price is listed
    price = _firstGroup(itemPrice, item, None)
    if price is None:
        price = _firstGroup(itemRim, item)
    price = price.replace('<span>', '').replace('</span>', '')

    # Area: drop the fixed label, keep the value
    area = _firstGroup(itemArea, item).replace('建筑面积：', '')

    # Status: group 2 of itemSale is the visible label
    saleMatch = itemSale.search(item)
    sale = saleMatch.group(2) if saleMatch else ''

    kind = _firstGroup(itemType, item)

    # Tags and room counts may occur several times; each entry is followed
    # by a comma (including the last one), empty string when there are none
    context = ''.join(tag + ',' for tag in itemContext.findall(item))
    houseType = ''.join(
        rooms + '室,' for rooms in itemHouseType.findall(item))

    return [img, name, address, price, area, sale, kind, context, houseType]


# Scrape the listing pages
def getData(pageIndex):
    """Fetch listing pages and return one row per listing card.

    Pages ``1`` through ``pageIndex - 1`` are requested via ``baseUrl``
    (``pageIndex`` is an exclusive upper bound, so ``getData(3)`` reads
    pages 1 and 2).
    NOTE(review): if callers mean "number of pages", this is one page
    short - confirm the intent before changing the range.

    Args:
        pageIndex: Exclusive upper bound of the page numbers to fetch.

    Returns:
        List of rows; each row is the nine-element list of strings built
        from one ``<div rel="nofollow">`` element of a page.
    """
    dataList = []
    for page in range(1, pageIndex):
        html = baseUrl(page)
        print('第%d页' % page)
        # Each listing card is a <div rel="nofollow"> element
        for item in html.find_all('div', attrs={'rel': 'nofollow'}):
            dataList.append(_parseItem(str(item)))
    return dataList


# Scrape the listing pages
dataList = getData(3)
# Column headings of the spreadsheet
titles = (
    '图片链接',
    '楼盘',
    '地址',
    '价格',
    '面积',
    '状态',
    '类型',
    '环境',
    '户室',
)
# Create the workbook and write all rows
saveData(dataList, titles, '58同城琼海新房.xls')
print('完成')