直接上代码,第一种方式获取不到数据。
# -*- coding: utf-8 -*-
import requests
import re
import sys
# Python 2-only idiom: reload the sys module, then set the interpreter-wide
# default string encoding to UTF-8 (affects implicit str <-> unicode
# conversions everywhere in the process, not just in this file).
# NOTE(review): presumably reload() is required because setdefaultencoding
# is not reachable on the sys module as first imported -- confirm against
# the Python 2 sys/site documentation. Neither call exists in Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")
from lxml import etree
class spider(object):
# 获取url对应的网页源码
def getsource(self, url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Pass ``None`` to wait
            indefinitely, which is how this method behaved before the
            parameter was added.

    Returns:
        The body of the response as decoded by ``requests``
        (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond
            within ``timeout`` seconds.
    """
    # Send a desktop Firefox User-Agent rather than the library default.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    # requests never times out on its own; without an explicit timeout an
    # unresponsive server would block the scraper forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def getNeedInfo(self,sourceHtml):
selector = etree.HTML(sourceHtml)
print selector
result = etree.tostring(selector)
#print(result.decode("utf-8"))
fd = open("result.txt", "w")
fd.write(result.decode("utf-8"))
fd.close()
html_title = selector.xpath('//a[@class="item"]/div[@class="cover-wp"]/img//@alt')
print html_title
ht