baseurl = 'http://www.qiushibaike.com/hot/page/2?s=4837277'
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
req = urllib2.Request(baseurl,headers=headers)
myres = urllib2.urlopen(req)
mypage = myres.read().decode("utf-8")
soup = BeautifulSoup(mypage,"lxml")
contentsoup = soup.findAll(name="div",attrs={"class":"content"})
for content in contentsoup:
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
req = urllib2.Request(baseurl,headers=headers)
myres = urllib2.urlopen(req)
mypage = myres.read().decode("utf-8")
soup = BeautifulSoup(mypage,"lxml")
contentsoup = soup.findAll(name="div",attrs={"class":"content"})
for content in contentsoup:
print type(content.get_text())
最近在学 BeautifulSoup,用 BeautifulSoup 解析 HTML 网页、获取指定标签的值,可以替代正则表达式。不知道为啥用 .string 方法老是返回 None(当标签包含多个子节点时 .string 就会返回 None),改用 get_text() 就可以了。
一般地,用 urlopen() 获得 HTML,再用 BeautifulSoup 分离出自己想要的部分。