\
# -*-coding:utf-8 -*-
2 import urllib2
3 import urllib
4 import re
5 import time
6 import thread
7
# Fetch one Douban group topic page and print the paragraph text captured
# from its rich-text block, one paragraph per line.
page = 113566835
url = 'https://www.douban.com/group/topic/' + str(page)
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Both patterns are loop-invariant, so compile them once up front.
# `pattern` captures everything between the first <p> after a "richtext" div
# and the </p> that directly precedes an "image-float-center" div; re.S lets
# '.' span newlines so a multi-line post body is matched.
pattern = re.compile('<div.*?richtext">.*?<p>(.*?)</p><div.*?image-float-center">', re.S)
# Paragraph boundaries inside a captured body are turned into newlines.
replacePP = re.compile('</p><p>')

try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        content = response.read().decode('utf-8')
    finally:
        # Always release the underlying connection, even if read/decode fails.
        response.close()
except urllib2.URLError as e:
    # Report whichever detail the error object carries: an HTTP status code
    # (HTTPError) and/or a failure reason.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    # Parsing and printing cannot raise URLError, so they live outside the
    # guarded region and run only when the download succeeded.
    items = re.findall(pattern, content)
    for item in items:
        info = re.sub(replacePP, "\n", item)
        # Single-argument print(...) emits the same output as the Python 2
        # print statement.
        print(info)