爬虫之抓取糗事百科的段子(Python 3.5 环境):
1. 下载页面
2. 解析页面(XPath 方法)
# -*-coding:utf-8 -*-
import urllib.request
import sys
import io
from lxml import etree
from urllib.parse import urljoin
# Replace sys.stdout with a TextIOWrapper around the original stdout's binary
# buffer so that everything printed afterwards is encoded as GB18030.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') # change the default encoding of standard output
def download(originer_url,p):
url=str(originer_url)+str(p)
print(url)
print (p)
#添加header
headers={'User-Agent':r'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)','Connection':'keep-alive'}
#创建opener
opener=urllib.request.build_opener()
opener.addheaders=[headers]
try:
page=opener.open(str(url)).read().dec