In cmd (the Windows command prompt):
1. Create the project: scrapy startproject xici    # xici is just a name you choose
2. Enter the project directory: cd xici
3. Create the spider: scrapy genspider ips xici.com    # the spider name is set to ips here; xici.com is the site you want to crawl (it can be changed in the code later)
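For reference, genspider creates xici/spiders/ips.py with roughly the following skeleton (the exact template varies slightly between Scrapy versions):

import scrapy

class IpsSpider(scrapy.Spider):
    name = 'ips'
    allowed_domains = ['xici.com']
    start_urls = ['http://xici.com/']

    def parse(self, response):
        pass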
4. Open the newly created project and go into the spider directory.
5. Open middlewares.py and set the user agent. The code is below (paste it straight into middlewares.py and leave the rest of the file alone); it comes from http://blog.csdn.net/liyuetao680/article/details/48501481
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            print("Using User-Agent:", ua)
            request.headers.setdefault('User-Agent', ua)

    # The default user_agent_list covers Chrome, IE, Firefox, Mozilla, Opera and Netscape.
    # For more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
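This subclasses Scrapy's built-in UserAgentMiddleware and overrides process_request, so every outgoing request picks a random string from user_agent_list; the print call is only there so you can watch the rotation in the crawl output, and it can be removed.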
6. Once middlewares.py is set up, don't forget to activate it in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'xici.middlewares.RotateUserAgentMiddleware': 400,
}
# "xici" above is my project (package) name; change it to your own.
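Setting the built-in UserAgentMiddleware to None disables it so it cannot overwrite the header set by RotateUserAgentMiddleware; 400 is simply the priority at which the custom middleware runs.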
7. Proxy IP setup; this code comes from http://blog.csdn.net/u011781521/article/details/70194744?locationNum=4&fps=1
Add the following to settings.py:
IPPOOL=[
{"ipaddr":"61.129.70.131:8080"},
{"ipaddr":"61.152.81.193:9100"},
{"ipaddr":"120.204.85.29:3128"},
{"ipaddr":"219.228.126.86:8123"},
{"ipaddr":"61.152.81.193:9100"},
{"ipaddr":"218.82.33.225:53853"},
{"ipaddr":"223.167.190.17:42789"}
]   # you have to scrape or copy-paste working proxy IPs yourself
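Free proxies go stale quickly, so it is worth sanity-checking candidates before putting them in IPPOOL. Here is a minimal standalone sketch, assuming the requests library is installed and using httpbin.org purely as an echo service (both are assumptions, not part of the Scrapy project itself):

import requests

candidates = ["61.129.70.131:8080", "120.204.85.29:3128"]  # sample entries from IPPOOL

for addr in candidates:
    try:
        # if the proxy works, httpbin echoes back the IP the request came from
        r = requests.get("http://httpbin.org/ip",
                         proxies={"http": "http://" + addr}, timeout=5)
        print(addr, "OK:", r.text.strip())
    except requests.RequestException as exc:
        print(addr, "failed:", exc)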
In middlewares.py:
from fj_gov.settings import IPPOOL    # fj_gov is the project folder name here
Still in middlewares.py, add the following inside the class FjGovSpiderMiddleware(object): class    # this class is generated automatically, and FjGov also comes from the project name
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        # random is already imported at the top of middlewares.py (see step 5)
        thisip = random.choice(IPPOOL)
        print("this is ip:" + thisip["ipaddr"])
        request.meta["proxy"] = "http://" + thisip["ipaddr"]
Back in settings.py, add these two entries to DOWNLOADER_MIDDLEWARES:
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 543,
    'fj_gov.middlewares.FjGovSpiderMiddleware': 125,
# fj_gov.middlewares and FjGovSpiderMiddleware have to be changed to your own project and middleware class names.
With that, the proxy IP setup is done!
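If you prefer not to piggyback on the auto-generated spider middleware class, a minimal sketch of a dedicated downloader middleware works the same way (ProxyMiddleware is a hypothetical name; register 'fj_gov.middlewares.ProxyMiddleware' in DOWNLOADER_MIDDLEWARES instead of the spider middleware entry):

import random

from fj_gov.settings import IPPOOL


class ProxyMiddleware(object):
    # a standalone downloader middleware; only process_request is needed
    def process_request(self, request, spider):
        thisip = random.choice(IPPOOL)
        request.meta["proxy"] = "http://" + thisip["ipaddr"]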
8. Set up items.py: items are much simpler; one your_field_name = scrapy.Field() per field is all it takes:
class FjGovItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    id = scrapy.Field()
    title = scrapy.Field()
    a_content = scrapy.Field()
    department = scrapy.Field()
    a_time = scrapy.Field()
    con_person = scrapy.Field()
    r_content = scrapy.Field()
    r_time = scrapy.Field()
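Each Field() call only declares a key; the spider then fills the item dict-style (item["id"] = ..., item["title"] = ...), as the code in step 9 shows.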
9. Writing a single-page spider (the basic logic of single-page crawling: build a pool of URLs first, then crawl them one by one):
# -*- coding: utf-8 -*-
import scrapy
from fj_gov.items import FjGovItem   # easy to get wrong: change the names to match your own project

class FjSpider(scrapy.Spider):
    name = 'fj'
    allowed_domains = ['fz12345.fuzhou.gov.cn']
    start_urls = ['http://fz12345.fuzhou.gov.cn/']

    def start_requests(self):
        reqs = []
        for y in range(13, 14):            # full crawl: %02d over (5, 18)
            for m in range(12, 13):        # full crawl: %02d over (1, 13)
                for d in range(31, 32):    # full crawl: %02d over (1, 32)
                    if ((m in [4, 6, 9, 11]) and d > 30) or (m == 2 and d > 29):
                        continue
                    for n in range(0, 1):  # full crawl: %05d over (0, 10000)
                        req = scrapy.Request("http://fz12345.fuzhou.gov.cn/detail.jsp?callId=FZ%02d%02d%02d%05d" % (y, m, d, n))
                        reqs.append(req)
        return reqs

    def parse(self, response):
        item = FjGovItem()   # easy to get wrong: use your own item class name
        try:   # look up XPath/CSS selector syntax separately if it is unfamiliar
            item["id"] = response.xpath("//div[@class='detail']//tr[1]/td[2]/text()").extract()[0]
            item["title"] = response.xpath("//div[@class='detail']//tr[2]/td[2]/text()")[0].extract()
            item["a_content"] = "".join(response.xpath("//div[@class='detail']//tr[3]/td[2]/text()")[:-3].extract())
            item["department"] = response.xpath("//div[@class='detail']//tr[4]//span[1]/text()")[0].extract()
            item["a_time"] = response.xpath("//div[@class='detail']//tr[7]/td[4]/text()")[0].extract().strip()
            item["con_person"] = response.xpath("//div[@class='detail']//tr[6]/td[2]/text()")[0].extract().strip()
            item["r_content"] = response.xpath("//div[@class='detail']//tr[5]/td[2]/text()[1]")[0].extract().strip()
            if item["r_content"]:
                item["r_time"] = response.xpath("//div[@class='detail']//tr[5]/td[2]/span[2]/text()")[0].extract().strip()
            else:
                item["r_time"] = ""
            yield item   # do not forget this
        except Exception:
            pass
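To run the spider and save whatever it yields, scrapy crawl fj -o result.csv (or result.json) uses Scrapy's built-in feed export, so no pipeline is needed for a quick dump.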
10. Deeper crawling (the logic: crawl one page first, pull links out of it, and keep following them; you can go as many levels deep as you like):
# -*- coding: utf-8 -*-
import scrapy
from minjian.items import MinjianItem

class NewsSpider(scrapy.Spider):
    name = 'news'
    allowed_domains = ['mju.edu.cn']
    start_urls = ['http://www.mju.edu.cn/html/xxyw/.html']

    def start_requests(self):
        reqs = []
        for i in range(1, 150):
            req = scrapy.Request("http://www.mju.edu.cn/html/xxyw/%s.html" % i)
            reqs.append(req)
        return reqs   # same as step 9: build the list of requests up front

    def parse(self, response):
        news_lists = response.xpath("/html/body/div[2]/div[3]/div[2]/div[2]/ul/li")
        for news_list in news_lists:
            item = MinjianItem()
            item["time"] = news_list.xpath("span[1]/text()").extract()
            item["title"] = news_list.xpath("span[2]/a/text()").extract()
            first_url = news_list.xpath("span[2]/a/@href")[0].extract()
            item["url"] = "http://www.mju.edu.cn%s" % first_url   # items.py must define url = scrapy.Field() first
            yield scrapy.Request(url=item["url"], meta={"item": item}, callback=self.number_parse, dont_filter=True)   # follow the link with a callback

    def number_parse(self, response):
        item = response.meta["item"]   # key step: meta carries the half-filled item into this callback
        div = response.xpath("//div[@class='detail_main_content']/div")
        div = div[0]
        item["number"] = div.xpath("span[1]/text()").extract()
        yield item   # to crawl a third level, add another callback here, and so on (see the sketch below)
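To make the "add another callback" remark concrete, here is a minimal, self-contained sketch of a three-level crawl; the spider name, URLs and XPaths are placeholders and a plain dict stands in for the Item class, so adapt all of them to the real site:

import scrapy


class ThreeLevelSpider(scrapy.Spider):
    name = "three_level_demo"                      # hypothetical spider name
    start_urls = ["http://example.com/list.html"]  # placeholder list page

    def parse(self, response):
        # level 1: collect links to the level-2 pages
        for href in response.xpath("//ul/li/a/@href").extract():
            item = {"first_url": response.urljoin(href)}
            yield scrapy.Request(item["first_url"], meta={"item": item},
                                 callback=self.second_parse, dont_filter=True)

    def second_parse(self, response):
        # level 2: enrich the item, then follow one more link if there is one
        item = response.meta["item"]
        item["title"] = response.xpath("//h1/text()").extract_first()
        next_href = response.xpath("//a[@class='more']/@href").extract_first()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), meta={"item": item},
                                 callback=self.third_parse, dont_filter=True)
        else:
            yield item

    def third_parse(self, response):
        # level 3: fill the last fields and hand the finished item to the engine
        item = response.meta["item"]
        item["body"] = response.xpath("//div[@class='content']//text()").extract()
        yield item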
Closing remarks:
This tutorial is only good for scraping simple, well-structured pages with weak anti-scraping measures.