import requests
from lxml import etree
from lxml import html
import os
#from lxml.html import fromstring, tostring
class Spider(object):
    """Crawl the Sina tech front page and save article text to a file.

    ``start_requests`` fetches the front page, collects the links found in
    the ``tech-news`` list, and hands each one to ``next_file``, which
    downloads the article and appends its paragraph text to ``OUTPUT_PATH``.
    """

    # Front page whose "tech-news" list supplies the article links.
    START_URL = "https://tech.sina.com.cn/"
    # File that receives the paragraph text of every crawled article.
    OUTPUT_PATH = 'demo.txt'
    # Seconds to wait on the server; without a timeout a stalled connection
    # would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def start_requests(self):
        """Fetch the front page and process every linked article.

        The output file is emptied once per crawl, after the front page has
        been fetched and parsed, so the file ends up holding the articles of
        this run only.

        Raises:
            requests.RequestException: If the front page or any article
                cannot be fetched (including timeouts).
        """
        response = requests.get(self.START_URL, timeout=self.REQUEST_TIMEOUT)
        # Force UTF-8 instead of relying on the encoding guessed by requests.
        response.encoding = 'utf-8'
        xml = etree.HTML(response.text)

        # Start from an empty file; next_file() appends one article per call.
        with open(self.OUTPUT_PATH, "w", encoding="utf-8"):
            pass

        # Read title and link from the same <a> element. Zipping two
        # independent XPath result lists misaligns (and can drop links) as
        # soon as one anchor has no text node or more than one.
        for anchor in xml.xpath('//div[@class="tech-news"]/ul/li/a'):
            src = anchor.get('href')
            if not src:
                continue
            texts = anchor.xpath('text()')
            title = texts[0] if texts else ''
            self.next_file(title, src)

    def next_file(self, tit_list, src_list):
        """Download one article and append its paragraph text to the output.

        Args:
            tit_list: Link text of the article as shown on the front page.
                Currently unused; kept so existing callers keep working.
            src_list: URL of the article page.

        Raises:
            requests.RequestException: If the article cannot be fetched
                (including timeouts).
        """
        response = requests.get(src_list, timeout=self.REQUEST_TIMEOUT)
        response.encoding = 'utf-8'
        xml = etree.HTML(response.text)
        content = "\n".join(xml.xpath('//div[@class="article"]/p/text()'))
        # Append rather than truncate: opening with "w" here would leave only
        # the last crawled article in the file.
        with open(self.OUTPUT_PATH, "a", encoding="utf-8") as f:
            f.write('\n' + content)
# Script entry point: build a Spider and start the crawl. There is no
# ``__main__`` guard, so this also runs whenever the module is imported.
spider = Spider()
spider.start_requests()