import re
import time
import urllib.error
import urllib.parse
import urllib.request
import urllib.response

# Topic ids collected by GetPageNum and later consumed by the script driver.
pagenum = []

TOPIC_URL = 'https://www.douban.com/group/topic/'
DISCUSSION_URL = 'https://www.douban.com/group/beijingzufang/discussion?start='
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
PAGE_SIZE = 25  # the driver advances the listing's ``start`` offset in steps of 25
SEPARATOR = "----------------------------------------------------------"

# Compiled once at import time instead of on every call.
POST_PATTERN = re.compile(
    r'<div.*?richtext">.*?<p>(.*?)</p><div.*?image-float-center">', re.S)
LINK_PATTERN = re.compile(
    r'<td.*?title">.*?<a.*?href=(.*?)title.*?</a>.*?</td>', re.S)
TOPIC_ID_PATTERN = re.compile(r'/group/topic/(\d+)')


def _fetch(url):
    """Download *url* with the script's User-Agent and return the decoded body.

    Raises:
        urllib.error.URLError: if the request fails (HTTPError is a subclass).
    """
    request = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as response:
        # errors='replace' keeps one malformed byte from aborting the whole run.
        return response.read().decode('utf-8', errors='replace')


def _report_url_error(error):
    """Print the HTTP status code and/or failure reason carried by *error*."""
    if hasattr(error, "code"):
        print(error.code)
    if hasattr(error, "reason"):
        print(error.reason)


def ParsePosts(content):
    """Return the post bodies found in a topic page's HTML.

    Each body is the text captured by POST_PATTERN, with paragraph
    boundaries (``</p><p>``) turned into newlines.
    """
    return [body.replace('</p><p>', "\n") for body in POST_PATTERN.findall(content)]


def ParseTopicIds(content):
    """Return the topic ids linked from a discussion-list page's HTML.

    The id is taken from the ``/group/topic/<digits>`` part of each title
    link rather than from fixed slice offsets, so the URL scheme, quoting
    style and trailing characters of the ``href`` do not matter. Links that
    do not point at a topic are skipped.
    """
    topic_ids = []
    for href in LINK_PATTERN.findall(content):
        found = TOPIC_ID_PATTERN.search(href)
        if found:
            topic_ids.append(found.group(1))
    return topic_ids


def PageOffsets(count):
    """Return the ``start`` offsets needed to list the newest *count* topics.

    Offsets begin at 0 (the newest topics) and the page count is rounded
    up, so any positive *count* yields at least one page. A non-positive
    *count* yields an empty list.
    """
    pages = -(-count // PAGE_SIZE)  # ceiling division without importing math
    return [index * PAGE_SIZE for index in range(pages)]


def GetInfo(page):
    """Fetch the topic with id *page* and print each post body it contains.

    Request failures are reported on stdout and otherwise ignored.
    """
    try:
        content = _fetch(TOPIC_URL + str(page))
    except urllib.error.URLError as e:
        _report_url_error(e)
        return
    for info in ParsePosts(content):
        print(SEPARATOR)
        print(info)
        print(SEPARATOR)


def GetPageNum(start):
    """Fetch the discussion list at offset *start* and record its topic ids.

    The ids are appended to the module-level ``pagenum`` list. Request
    failures are reported on stdout and otherwise ignored.
    """
    try:
        content = _fetch(DISCUSSION_URL + str(start))
    except urllib.error.URLError as e:
        _report_url_error(e)
        return
    pagenum.extend(ParseTopicIds(content))


def main():
    """Ask how many of the newest topics are wanted, then print them."""
    print("请输入最新需要条数: ")
    wanted = int(input())
    for offset in PageOffsets(wanted):
        GetPageNum(offset)
    # The last listing page may overshoot; keep only what was asked for.
    for topic_id in pagenum[:max(wanted, 0)]:
        GetInfo(topic_id)


if __name__ == '__main__':
    main()
Python 爬虫:豆瓣租房
最新推荐文章于 2024-01-17 15:28:54 发布