Getting Started
1. Create the project:
scrapy startproject loginScrapy
2. Create the spider:
cd loginScrapy
scrapy genspider zhihu www.zhihu.com
3. Modify the project files. Add the following to settings.py:
ROBOTSTXT_OBEY = False  # Zhihu's robots.txt disallows crawling by default, so obeying it would stop the crawl before it starts
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 0.25  # 250 ms of delay
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
4. Add a random User-Agent in the middlewares
I use a proxy pool that I maintain myself, so random proxies are not covered here; we only add a random User-Agent. Write the random-header middleware in middlewares.py:
import random


class RandomUserAgentMiddleware():
    def __init__(self):
        # a pool of desktop and mobile User-Agent strings to rotate through
        self.user_agents = [
            "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
            "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
            "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
            "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
            "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
            "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8J2",
            "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; OMNIA7)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; XBLWP7; ZuneWP7)",
            "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; TheWorld)",
        ]

    def process_request(self, request, spider):
        # overwrite the User-Agent header with a random pick on every request
        request.headers['User-Agent'] = random.choice(self.user_agents)
Then enable it in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'loginScrapy.middlewares.RandomUserAgentMiddleware': 543,
}
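To confirm the middleware is active, one quick check (a sketch; httpbin.org is just a public echo service, not part of this project) is to run scrapy shell from inside the project, which picks up the project's middlewares, and inspect the User-Agent the server received:
scrapy shell "https://httpbin.org/headers"
>>> import json
>>> json.loads(response.text)["headers"]["User-Agent"]
Each run should print a different entry from the list above.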
Writing the Code
1. Modify the spider file (note: the spider's name attribute is 'zhihu2', which LoginMiddleware checks later):
import json
import logging

import scrapy

from loginScrapy.items import UserItem


class TaobaoSpider(scrapy.Spider):
    name = 'zhihu2'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/signin']
    # URL template for a user's detail record on the API
    user_detail = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    # include parameter for the user-detail request
    user_include = 'allow_message,is_followed,' \
                   'is_following,' \
                   'is_org,is_blocking,' \
                   'employments,' \
                   'answer_count,' \
                   'follower_count,' \
                   'articles_count,' \
                   'gender,' \
                   'badge[?(type=best_answerer)].topics'
    # URL template for the list of people a user follows
    follow_detail = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    # include parameter for the followees request
    follow_include = 'data[*].answer_count,' \
                     'articles_count,' \
                     'gender,' \
                     'follower_count,' \
                     'is_followed,' \
                     'is_following,' \
                     'badge[?(type=best_answerer)].topics'
    # seed user the crawl starts from
    start_user = 'sun-lu-66-67'

    def __init__(self, *args, **kwargs):
        # browser and cookies are filled in later by LoginMiddleware
        self.browser = None
        self.cookies = None
        super(TaobaoSpider, self).__init__(*args, **kwargs)  # forwarding *args/**kwargs to the parent class is essential here

    def parse(self, response):
        # debug helpers: print the URL and page source if needed
        # print(response.url)
        # print(response.body.decode("utf-8", "ignore"))
        # parse() receives the login page; from here we branch out to the JSON API.
        # Note how str.format() fills in the URL templates defined above.
        yield scrapy.Request(self.user_detail.format(user=self.start_user, include=self.user_include),
                             callback=self.parse_user, dont_filter=True)
        yield scrapy.Request(self.follow_detail.format(user=self.start_user, include=self.follow_include,
                                                       offset=20, limit=20),
                             callback=self.parse_follow, dont_filter=True)
        # yield scrapy.Request("https://www.zhihu.com/people/sun-lu-66-67/activities", callback=self.get_info, dont_filter=True)

    def get_info(self, response):
        # debug callback: dump the raw response
        print("%%%%%%%%%" * 30)
        print(response.url)
        print(response.body.decode("utf-8", "ignore"))

    def parse_user(self, response):
        """
        Parse a user-detail response.
        :param response: the API response; its JSON body is parsed with json.loads
        """
        # print('user:%s' % response.text)
        results = json.loads(response.text)
        item = UserItem()
        # for every field declared on the item, copy the value if the response contains it
        for field in item.fields:
            if field in results.keys():
                item[field] = results.get(field)
        yield item
        # build this user's followees URL with format() and hand it to parse_follow
        yield scrapy.Request(self.follow_detail.format(user=results.get('url_token'),
                                                       include=self.follow_include, offset=0, limit=20),
                             callback=self.parse_follow, dont_filter=True)

    def parse_follow(self, response):
        """
        Parse a followees-list response.
        """
        # print('follow:%s' % response.text)
        results = json.loads(response.text)
        # if a data array is present, queue a user-detail request for each entry
        if 'data' in results.keys():
            for result in results.get('data'):
                yield scrapy.Request(self.user_detail.format(user=result.get('url_token'),
                                                             include=self.user_include),
                                     callback=self.parse_user, dont_filter=True)
        # if paging exists and is_end is False, fetch the next page;
        # is_end == True means this was the last page
        if 'paging' in results.keys() and not results.get('paging').get('is_end'):
            next_page = results.get('paging').get('next')
            logging.warning(next_page)
            yield scrapy.Request(next_page, callback=self.parse_follow, dont_filter=True)
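As a side note on the str.format() calls in parse(): they simply substitute the user token and include string into the URL templates. A minimal illustration, with a shortened, made-up include string:
user_detail = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
print(user_detail.format(user='sun-lu-66-67', include='follower_count,answer_count'))
# prints: https://www.zhihu.com/api/v4/members/sun-lu-66-67?include=follower_count,answer_count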
2. Write the items file
An Item is the container that holds the scraped data, and it is used like a dict. To create one, subclass scrapy.Item and declare each field as scrapy.Field (a short usage sketch follows the class below).
import scrapy


class UserItem(scrapy.Item):
    """
    Declares the fields found in the JSON response.
    """
    collection = 'zhihuUser'  # MongoDB collection name, used by the pipeline
    is_followed = scrapy.Field()
    avatar_url_template = scrapy.Field()
    user_type = scrapy.Field()
    answer_count = scrapy.Field()
    is_following = scrapy.Field()
    url = scrapy.Field()
    type = scrapy.Field()
    url_token = scrapy.Field()
    id = scrapy.Field()
    allow_message = scrapy.Field()
    articles_count = scrapy.Field()
    is_blocking = scrapy.Field()
    name = scrapy.Field()
    headline = scrapy.Field()
    gender = scrapy.Field()
    avatar_url = scrapy.Field()
    follower_count = scrapy.Field()
    is_org = scrapy.Field()
    employments = scrapy.Field()
    badge = scrapy.Field()
    is_advertiser = scrapy.Field()
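Because an Item behaves like a dict, fields are read and written with subscript syntax; a minimal sketch (the values here are made up):
from loginScrapy.items import UserItem

item = UserItem()
item['name'] = 'some user'        # assign a declared field
item['follower_count'] = 100
print(item['name'], dict(item))   # read a field back, or convert to a plain dict
# assigning a key that was never declared as scrapy.Field() raises KeyError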
3. Write the pipelines file
This time MongoDB is written with an upsert keyed on url_token, so duplicate users are overwritten instead of stored twice.
import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings from settings.py
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'),
                   mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # upsert keyed on url_token: update the user if seen before, insert otherwise
        # (update_one with upsert=True is the current pymongo spelling of the old update(..., True))
        self.db[item.collection].update_one({'url_token': item['url_token']},
                                            {'$set': dict(item)}, upsert=True)
        return item

    def close_spider(self, spider):
        self.client.close()
1) Add the configuration to settings.py:
MONGO_URI = 'localhost'
MONGO_DB = 'zhihu'
ITEM_PIPELINES = {
    'loginScrapy.pipelines.LoginscrapyPipeline': 200,
    'loginScrapy.pipelines.MongoPipeline': 300,
}
2) Add to items.py (already shown on the UserItem class above):
collection = 'zhihuUser'
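To verify the deduplication, the collection can be queried directly with pymongo (a sketch, assuming the MONGO_URI/MONGO_DB values above and pymongo 3.7+):
import pymongo

client = pymongo.MongoClient('localhost')
db = client['zhihu']
print(db['zhihuUser'].count_documents({}))                       # total users stored
print(db['zhihuUser'].find_one({'url_token': 'sun-lu-66-67'}))   # the seed user
# re-running the spider should not grow the count for users already stored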
4. Write the middlewares file
This time Selenium is used to log in to Zhihu and capture the cookies, which are then attached to every later request.
1 spider.browser.page_source returns the page source from the Selenium browser
2 session.get(request.url).text returns the page source from a requests session
3 requests manages cookies through a session object
4 urllib manages cookies through a CookieJar (a sketch follows below)
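Point 4 is not used in this project, but for completeness, here is a minimal sketch of cookie handling with urllib and the standard-library http.cookiejar:
import urllib.request
from http.cookiejar import CookieJar

jar = CookieJar()  # cookies set by responses accumulate here
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
response = opener.open('https://www.zhihu.com')  # later requests through this opener resend the stored cookies
for cookie in jar:
    print(cookie.name, cookie.value)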
import time

import requests
from scrapy.http import HtmlResponse
from selenium import webdriver


class LoginMiddleware(object):
    '''
    Find the username/password inputs and send_keys to them,
    click the login button and capture the cookies with
    spider.browser.get_cookies(), then return the page as an HtmlResponse.
    '''
    def process_request(self, request, spider):
        if spider.name == "zhihu2":  # only handle the spider with this name
            if request.url.find("signin") != -1:  # is this the login page?
                mobilesetting = {"deviceName": "iPhone 6 Plus"}
                options = webdriver.ChromeOptions()  # browser options
                options.add_experimental_option("mobileEmulation", mobilesetting)  # emulate a phone
                spider.browser = webdriver.Chrome("D://360Downloads//Software//chromedriver.exe",
                                                  chrome_options=options)  # create a browser instance
                spider.browser.set_window_size(400, 800)  # phone-sized window
                spider.browser.get(request.url)  # open the login page
                time.sleep(3)  # wait for the page to render before typing the username and password
                print("login visiting", request.url)
                # switch to the password-login tab
                ps_login = spider.browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div[2]/div/button')
                ps_login.click()
                username = spider.browser.find_element_by_xpath('//input[@name="username"]')
                password = spider.browser.find_element_by_xpath('//input[@name="password"]')
                time.sleep(1)
                username.send_keys("XXX")  # account
                time.sleep(2)
                password.send_keys("XXX")  # password
                time.sleep(2)
                spider.browser.find_element_by_xpath('//button[@type="submit"]').click()
                time.sleep(4)
                spider.cookies = spider.browser.get_cookies()  # capture all cookies
                # spider.browser.close()
                return HtmlResponse(url=spider.browser.current_url,  # current URL
                                    body=spider.browser.page_source,  # page source
                                    encoding="utf-8")  # hand the page back to the spider
            else:  # every request after login
                '''
                1 keep the cookies in a requests.session
                2 set each cookie with session.cookies.set(name, value)
                3 clear the headers with session.headers.clear()
                4 issue the GET request with session.get(url)
                '''
                print("request visiting")
                session = requests.session()  # one session per request
                for cookie in spider.cookies:
                    session.cookies.set(cookie['name'], cookie["value"])
                session.headers.clear()  # drop the default requests headers
                newpage = session.get(request.url)
                print("---------------------")
                print(request.url)
                print("---------------------")
                # print(newpage.text)
                time.sleep(3)
                return HtmlResponse(url=request.url,  # requested URL
                                    body=newpage.text,  # page source
                                    encoding="utf-8")  # hand the page back to the spider
1) Configure both middlewares in settings.py (distinct priorities keep the execution order explicit):
DOWNLOADER_MIDDLEWARES = {
    # 'loginScrapy.middlewares.LoginscrapyDownloaderMiddleware': 543,
    'loginScrapy.middlewares.RandomUserAgentMiddleware': 543,
    'loginScrapy.middlewares.LoginMiddleware': 544,
}
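With everything configured, run the spider from the project root; the argument is the spider's name attribute ('zhihu2'), not the file name:
scrapy crawl zhihu2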
The full code is shared here: https://download.csdn.net/download/huangwencai123/11147561