# Scraper for jy.sccnn.com ("设计原" design-resource site)
import requests
import re
from lxml import etree
import os
# Create the top-level output directory for all scraped content (idempotent).
os.makedirs(f'shejy/',exist_ok=True)
# Get the pagination info (last-page marker) for a category
def get_page(category):
    """Fetch a category's first listing page and return the raw pagination
    fragment containing the last-page ("››") link.

    NOTE(review): in the original file the first regex literal on this
    function was truncated (unterminated string, a syntax error), so the
    exact "narrowing" pattern it used is unrecoverable. This rebuild
    matches the pagination marker directly against the full page HTML —
    confirm against the live site markup that this yields the same span.

    :param category: category id interpolated into the sccnn.com URL
                     (numeric id or string — passed through unchanged).
    :return: the matched pagination substring, e.g. '>›...››'.
    :raises IndexError: if no pagination marker is found on the page.
    """
    url = f"http://jy.sccnn.com/category-{category}_1.html"
    html = requests.get(url).text
    # "››" is the site's last-page anchor text; grab everything from the
    # first ">›" up to (and including) "››" so the caller can extract
    # the page count from the enclosed link.
    pageze = r'>›.+?››'
    matches = re.findall(pageze, html, re.S)
    page = matches[0]  # presumably the pager always exists — TODO confirm
    print(page)
    return page
# Get the list of article links for one listing page
def get_urls(category,i):
url=f"http://jy.sccnn.com/category-{category}_{i}.html"
try:
html=requests.get(url).text
#print(html)
urlsze=r'auth1">.+?
urls=re.findall(urlsze,html,re.S)
with open('shejy/spider.txt','a+',encoding='utf-8') as f:
h = con.xpath('//div[@class="singletitle"]/h2/text()')
h = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", h) # 剔除不合法字符
os.makedirs(f'shejy/{h}/', exist_ok=True)
con_text = re.findall(con_textze, html, re.S)
text = texts.xpath('string(.)')
with open(f'shejy/{h}/{img_name}', 'wb') as f:
with open('shejy/spider.txt', 'a+', encoding='utf-8') as f:
f.write(f"{img_url}-----访问失败!\n")
dates = '%s%s%s%s%s' % (h, '\n', con_text, '\n', text)
with open(f'shejy/{h}/{h}.txt', 'w+', encoding='utf-8') as f:
with open('shejy/spider.txt','a+',encoding='utf-8') as f: