# 1. 先定位最外层的标签,再逐层取到内部的子标签,最后用 text() 获取标签文本
# Collect the text of every <i> element under the tag-box spans of the
# book-info block, then print each one.
type_list = response.xpath(
    '//div[@class="book-info"]/p[@class="tag-box"]/span/i/text()'
).extract()
# Loop variable renamed from `type` so the builtin is not shadowed.
for book_type in type_list:
    print(book_type)
# 2. 用正则去掉标签等不需要的内容:先用 re.compile() 编译模式,再用 sub() 把匹配到的内容替换为空字符串
# Take the first cover-image src, strip carriage returns from it, prefix it
# with 'http:' (presumably the src is protocol-relative -- confirm) and print
# the resulting URL.
img_sources = response.xpath(
    '//div[@class="book-information cf"]/div[@class="book-img"]/a/img/@src'
).extract()
pattern = re.compile(r'\r', re.S)
# Indexing [0] raises IndexError when the XPath matched nothing.
book_img = 'http:' + pattern.sub('', img_sources[0])
print(book_img)
# 3. /text() 只获取当前标签自身的文本;//text() 获取当前标签及其所有子标签的文本
# Print every direct text node of the <p class="intro"> element and store
# the text on the item.
detail = response.xpath('//p[@class="intro"]/text()').extract()
for de in detail:
    print(de)
    # NOTE(review): the original indentation was lost, so it is unclear
    # whether this assignment belongs inside the loop. Either way only the
    # last text node ends up stored; if the whole intro is wanted, confirm
    # and assign `detail` instead.
    item['de'] = [de]
# 4. 使用 xpath('string(.)') 获取标签内的所有文本,并拼接成一个字符串
# Gather the full text of each "p_content " div via string(.) and append it,
# with all whitespace removed, to xiaoshuo.txt.
content_list = x.xpath('.//div[@class="p_content "]').xpath('string(.)').extract()
# print(content_list)
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
remove = re.compile(r'\s')
# `with` closes the file even if a write fails; the original never closed it.
with open('xiaoshuo.txt', 'a', encoding='utf-8') as f:
    # Loop variable renamed so it no longer rebinds the selector `x` above.
    for content in content_list:
        f.write(remove.sub('', content))
        # NOTE(review): the original indentation was lost; this assumes one
        # newline per entry (inside the loop) rather than a single trailing
        # newline after all entries -- confirm.
        f.write('\n')