更多解释请打开链接查看;这里使用 BeautifulSoup 进行爬取。
#!/usr/bin/python
#coding: utf-8
from bs4 import BeautifulSoup
import re, sys, urllib, urllib2
reload(sys)
# Python 2 only: make UTF-8 the default codec for implicit str/unicode
# conversions (the unicode strings written to stdout below rely on it).
sys.setdefaultencoding("utf-8")

# Loop-invariant request settings, hoisted out of the prompt loop.
BASE_URL = "http://www.qiushibaike.com/hot/page/"
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
# Seconds to wait for the server before giving up, so the prompt never hangs.
TIMEOUT = 10

# Interactive loop: ask for a page number, download that page and print the
# text of every <div class="content"> element on it. Entering 0 quits.
while True:
    try:
        page = int(raw_input(u"请输入一个数字(输入0结束), 荤段子只有35页:"))
    except ValueError as e:
        # Not an integer: report it and ask again.
        print(e)
        print(u"请输入数字")
        continue
    except EOFError:
        # stdin is closed, so no further input can arrive; retrying here
        # would spin forever.
        break
    if page == 0:
        break

    url = BASE_URL + str(page) + "/"
    try:
        request = urllib2.Request(url, headers=HEADERS)
        response = urllib2.urlopen(request, timeout=TIMEOUT)
        try:
            html = response.read().decode("utf-8")
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()
        soup = BeautifulSoup(html, "lxml")
        items = soup.find_all("div", {"class": "content"})
        sys.stdout.write("\n")
        sys.stdout.write(u"第%d页\n" % page)
        sys.stdout.write("\n")
        # Number the entries from 1; a separate name keeps `page` intact.
        for index, item in enumerate(items, 1):
            sys.stdout.write(u"第%d条" % index)
            sys.stdout.write("\n")
            sys.stdout.write(item.get_text())
            sys.stdout.write("\n")
    except Exception as e:
        # Top-level boundary of the script: report any fetch/parse failure
        # and go back to the prompt instead of crashing.
        print(e)
        print(u"出错了,无法链接糗事百科!")
使用类封装
#!/usr/bin/python
#coding: utf-8
import re, sys, urllib, urllib2
from bs4 import BeautifulSoup
class Qiushi_spider(object):
    """Download one page of the qiushibaike.com "hot" listing and print it."""

    def __init__(self, x):
        """Store the page number and build the URL of that page.

        x -- page number; it is later formatted with ``%d``, so it is
             expected to be an integer.
        """
        self.x = x
        self.url = "http://www.qiushibaike.com/hot/page/" + str(self.x) + "/"

    def find_out(self):
        """Fetch ``self.url`` and write every entry on the page to stdout.

        An entry is the text of a ``<div>`` whose class is ``content``.
        Any exception raised while fetching, parsing or printing is caught
        and reported on stdout; nothing is returned or re-raised.
        """
        headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        try:
            request = urllib2.Request(self.url, headers=headers)
            # A timeout keeps the interactive caller from hanging forever
            # on an unresponsive server.
            response = urllib2.urlopen(request, timeout=10)
            try:
                html = response.read()
            finally:
                # Release the connection even if the read fails.
                response.close()
            soup = BeautifulSoup(html, "lxml")
            # attrs must be a dict here: the previous set literal
            # {"class", "content"} was read by BeautifulSoup as "class is
            # either 'class' or 'content'".
            items = soup.find_all("div", {"class": "content"})
            sys.stdout.write("\n")
            sys.stdout.write(u"第%d页\n" % self.x)
            sys.stdout.write("\n")
            for num, item in enumerate(items, 1):
                sys.stdout.write(u"第%d条" % num)
                sys.stdout.write("\n")
                sys.stdout.write(item.get_text())
                sys.stdout.write("\n")
        except Exception as e:
            print(e)
            print(u"无法连接到糗事百科,请重新输入")
if __name__ == "__main__":
    # Interactive driver: keep asking for a page number and print that page
    # through Qiushi_spider until the user enters 0.
    while True:
        try:
            x = int(raw_input(u"请输入一个数字(输入0结束), 荤段子只有35页:"))
        except ValueError as e:
            # Not an integer: report it and ask again.
            print(e)
            print(u"输入出错了,请重新输入")
            continue
        except EOFError:
            # stdin is closed, so no further input can arrive; retrying
            # here would spin forever.
            break
        if x == 0:
            break
        # find_out() reports its own failures, so it stays outside the
        # input-validation try block.
        spider = Qiushi_spider(x)
        spider.find_out()