# coding:utf-8
# Crawler that logs in to CSDN and collects {article_url: title} for every
# entry on a user's paginated blog list.
import requests
from lxml import etree

is_next = True  # global flag: True while more blog-list pages remain


class csdncrawl():

    def get_params(self, username, password, post_url, post_session, post_headers):
        """Build the POST payload for CSDN's login endpoint.

        Fetches the login page and scrapes the 'lt', 'execution' and
        '_eventId' CSRF-style tokens out of its hidden <input> tags, then
        returns them together with the credentials as a dict.
        """
        index_page = post_session.get(post_url, headers=post_headers)
        html = etree.HTML(index_page.text)
        lt = html.xpath(".//input[@name='lt']//@value")[0]
        execution = html.xpath(".//input[@name='execution']//@value")[0]
        _eventId = html.xpath(".//input[@name='_eventId']//@value")[0]
        return {
            'username': username,
            'password': password,
            'lt': lt,
            'execution': execution,
            '_eventId': _eventId,
        }

    def csdn_login(self, username, password, index_url, session, headers):
        """Log in: scrape the hidden form tokens, then POST the credentials."""
        postdata = self.get_params(username, password, index_url, session, headers)
        session.post(index_url, data=postdata, headers=headers)

    def startcrawl(self, session):
        """Entry point: log in, verify via the personal-center page, then
        crawl the blog article list starting from page 1."""
        username = 'zkwniky'
        password = '+++++++'  # placeholder -- fill in the real password
        start_page_number = 1
        dict_blog = {}
        index_url = 'https://passport.csdn.net/account/login'
        agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
        headers = {'User-Agent': agent}
        self.csdn_login(username, password, index_url, session, headers)
        self.crawl_person_csdn(session, headers)  # proves the login worked
        return self.crawl_blog_list(dict_blog, username, session, headers,
                                    start_page_number)

    def crawl_person_csdn(self, session, headers):
        """Fetch the personal-center page; a valid response confirms login."""
        person_url = 'http://my.csdn.net/my/mycsdn'
        person = session.get(person_url, headers=headers)
        print(person.text)

    def crawl_blog_list(self, dict_blog, username, session, headers, start_page_number):
        """Walk the paginated article list and return {url: title}.

        Keeps fetching pages while the global is_next flag (maintained by
        judge_next_page) says more pages exist.
        """
        global is_next
        page_number = start_page_number
        dict_blog_list = dict_blog
        while is_next:
            blog_url = ('http://blog.csdn.net/' + username
                        + '/article/list/' + str(page_number))
            blog_page = session.get(blog_url, headers=headers)
            print(blog_page.text)
            html = etree.HTML(blog_page.text)
            href = html.xpath(".//span[@class='link_title']//a//@href")
            title = html.xpath(".//span[@class='link_title']//a/text()")
            current_page_number = html.xpath(".//div[@class='pagelist']//strong/text()")
            last_page_links = html.xpath(".//div[@class='pagelist']//a//@href")
            for h, t in zip(href, title):
                dict_blog_list['http://blog.csdn.net' + h] = t
            # BUGFIX: a single-page blog has no pager at all; the original
            # crashed on current_page_number[0] here.
            if not current_page_number or not last_page_links:
                is_next = False
                break
            # BUGFIX: the last pager href ends in the full page number
            # ('.../article/list/12'); the original took only its final
            # character, which is wrong for any blog with more than 9 pages.
            last_page = last_page_links[-1].rstrip('/').rsplit('/', 1)[-1]
            if self.judge_next_page(current_page_number[0], last_page):
                page_number += 1
        print(len(dict_blog_list))
        return dict_blog_list

    def judge_next_page(self, current_page, next_page):
        """Record in the global is_next flag whether another page exists.

        Page numbers arrive as strings scraped from the pager; BUGFIX:
        compare them numerically -- as strings, '9' < '10' is False.
        Malformed values are treated as "no next page".
        """
        global is_next
        try:
            is_next = int(current_page) < int(next_page)
        except (TypeError, ValueError):
            is_next = False
        return is_next


if __name__ == '__main__':
    session = requests.session()
    # BUGFIX: the original bound the instance to the name 'csdncrawl',
    # shadowing the class itself.
    crawler = csdncrawl()
    crawler.startcrawl(session)
解释如下:
1)
整体过程比较简单,登录时post数据如下:
_eventId=submit
execution=e1s1
lt=LT-597060-IAanNajzYkoNV67gnQpFNT9m7goQ7U
password=++++++
username=zkwniky
其中前三个的值需要在登录页面中的隐藏标签中获取
2)
判断是否有下一页时,采用了比较当前页码与最后一页页码大小的方法
3)
python 2.7 执行成功
4)登录到个人中心时 返回json数据地址如下
全部文章
http://my.csdn.net/my/mycsdn/get_read_list?lastId=-&size=10&direction=down&type=
热门博客列表
http://my.csdn.net/my/mycsdn/get_hot_blog_list?pageno=1&pagesize=5&username=zkwniky
热门资源列表
http://my.csdn.net/my/mycsdn/get_hot_download_list
热门搜索 :java,python,spring,mysql,php
http://so.csdn.net/so/search/hotQuery.do?&callback=jQuery19009254916422648101_1501121642771&size=5&_=1501121642773
精彩回答
http://my.csdn.net/my/mycsdn/get_ask_list
爬虫 登录csdn并获取个人博客文章列表
最新推荐文章于 2024-01-10 17:33:44 发布