python爬虫豆瓣租房

最新推荐文章于 2024-01-17 15:28:54 发布

Larsongo

最新推荐文章于 2024-01-17 15:28:54 发布

阅读量1.2k

点赞数 1

分类专栏： python

本文链接：https://blog.csdn.net/lisheninasiainfo/article/details/80396301

版权

python 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

import urllib.request
import urllib.error
import urllib.response
import urllib.parse
import time
import re

# Module-level accumulator of topic identifiers: GetPageNum() appends to it,
# and the script entry point iterates it, passing each entry to GetInfo().
pagenum = []

def GetInfo(page):
    """Fetch one Douban group topic and print the text of its post body.

    Args:
        page: Topic identifier; it is appended to
            ``https://www.douban.com/group/topic/`` to build the request URL.

    Returns:
        None. Each matched post body is printed between two dashed separator
        lines, with ``</p><p>`` paragraph breaks turned into newlines.
        If the request fails with ``urllib.error.URLError``, the error's
        ``code`` and/or ``reason`` (whichever it carries) is printed instead
        and the function returns without raising.
    """
    url = 'https://www.douban.com/group/topic/' + str(page)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    # Wrap the URL in a Request so the headers are actually transmitted;
    # passing the bare URL to urlopen() would leave them unused.
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    # re.S lets '.' cross newlines, since the post body spans several lines.
    pattern = re.compile('<div.*?richtext">.*?<p>(.*?)</p><div.*?image-float-center">', re.S)
    separator = "----------------------------------------------------------"
    for item in pattern.findall(content):
        # Adjacent paragraphs are joined by a literal '</p><p>'; show them as
        # separate lines.
        info = item.replace('</p><p>', "\n")
        print(separator)
        print(info)
        print(separator)


def GetPageNum(start):
    """Collect topic ids from one page of the beijingzufang discussion list.

    Args:
        start: Value of the ``start`` query parameter of the listing URL
            (the offset of the first topic on the requested page).

    Returns:
        None. Every topic id found is appended, as a string, to the
        module-level ``pagenum`` list.
        If the request fails with ``urllib.error.URLError``, the error's
        ``code`` and/or ``reason`` (whichever it carries) is printed and
        nothing is appended.
    """
    url = 'https://www.douban.com/group/beijingzufang/discussion?start=' + str(start)
    # Send the same browser-like User-Agent that GetInfo() declares, so both
    # requests present themselves to the site consistently.
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    # Captures the raw text between 'href=' and 'title' of each title cell.
    pattern = re.compile('<td.*?title">.*?<a.*?href=(.*?)title.*?</a>.*?</td>', re.S)
    # Pull the numeric id out of the captured href by pattern instead of by
    # fixed character offsets, which break as soon as the quoting, scheme or
    # trailing whitespace of the attribute differs.
    topic_id = re.compile(r'/group/topic/(\d+)')
    for item in pattern.findall(content):
        found = topic_id.search(item)
        if found:
            pagenum.append(found.group(1))

if __name__ == '__main__':
    # Script entry point: ask how many of the newest topics are wanted,
    # collect their ids from the listing pages, then print each topic's body.
    print("请输入最新需要条数： ")
    AllTitles = int(input())
    # The listing offsets used here advance in steps of 25 topics per page.
    PageSize = 25
    # Ceiling division, so a request that is not a multiple of the page size
    # (including anything below 25) still fetches enough pages.
    PageCount = (AllTitles + PageSize - 1) // PageSize
    # Offsets begin at 0: the newest topics are on the first listing page,
    # which would be skipped if the loop started at one page size.
    for i in range(PageCount):
        GetPageNum(i * PageSize)
    # Whole pages were fetched; trim to the number actually requested.
    for num in pagenum[:AllTitles]:
        GetInfo(num)

Larsongo

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
python爬虫豆瓣租房

import urllib.request; import urllib.error; import urllib.response; import urllib.parse; import time; import re; pagenum = []; def GetInfo(page): url = 'https://www.douban.com/group/topic/' + str(page)...
复制链接

扫一扫