# Main code (主要代码)
import scrapy
from scrapy import Selector
from first.items import TYItem
import re
class TianyaSpider(scrapy.Spider):
# Spider name.
name = 'tianya'

# Domains this spider is restricted to.
allowed_domains = ['tianya.cn']

# Settings controlling how responses to requests are handled.
meta = {
    'dont_redirect': True,  # do not follow page redirects
    'handle_httpstatus_list': [301, 302],  # non-200 statuses to handle
}

# No cookies are configured by default.
# NOTE(review): presumably attached to requests built elsewhere in the
# class -- the usage is not visible in this part of the file.
cookies = {}

# Browser-like request headers. The User-Agent product tokens must be
# written as "name/version" with no whitespace around the slash; the
# previous value contained stray spaces and fused tokens.
header = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
    ),
}

# Initial URLs to crawl.
# NOTE(review): the trailing number before ".shtml" appears to be the
# page number -- confirm against get_url.
start_urls = [
    "http://bbs.tianya.cn/post-funinfo-7049670-1.shtml",
    "http://bbs.tianya.cn/post-16-1632189-1.shtml",
    "http://bbs.tianya.cn/post-16-1642094-1.shtml",
]
def get_url(self,pre_url):
url_head = pre_url.split('-')[0:-1]
page = int(pre_url.split('-')[-1].split('.')[0])+