Notes
Only the code is posted here; the design and technical approach are unchanged from before.
Code notes
Cookies are needed to bypass the login, and Selenium is used to drive Firefox, so the geckodriver executable must be installed. In addition, the data is stored in SQLite, which needs to be installed as well.
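The cookies.json file read by HtmlDownloader below has to be produced beforehand. A minimal sketch of one way to generate it, assuming you log in by hand in the opened browser; the login URL and the 60-second wait are assumptions, not part of the original project:

import json
import time
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('https://account.cnblogs.com/signin')  # assumed login page URL
time.sleep(60)  # log in manually in the browser window during this pause

# dump the authenticated session's cookies in the shape add_cookie() expects
with open('cookies.json', 'w', encoding='utf-8') as f:
    json.dump(driver.get_cookies(), f)
driver.quit()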
Spider.py
import re
from selenium import webdriver

import HtmlDownloader
import HtmlParser
import DataOutput
import UrlManager


class Spider(object):
    def __init__(self):
        self.downloader = HtmlDownloader.HtmlDownloader()
        self.parser = HtmlParser.HtmlParser()
        self.output = DataOutput.DataOutput()
        self.urlManager = UrlManager.UrlManager()
        self.driver = webdriver.Firefox()

    def crawl(self, root_url):
        content = self.downloader.download_root(root_url, self.driver)
        urls = self.parser.parser_url(content)
        self.urlManager.add_urls(urls)
        i = 0
        while self.urlManager.new_urls_size() > 0 and self.urlManager.old_urls_size() < 2000:
            url = self.urlManager.get_new_url()
            i = i + 1
            print(str(i) + ':' + str(url))
            # pick the user-name segment out of the blog URL and build the
            # corresponding profile page URL on home.cnblogs.com
            pattern = re.compile('/.*?/')
            user_name = re.findall(pattern, url)
            url = 'https://home.cnblogs.com' + user_name[1]
            content = self.downloader.download(self.driver, url)
            new_urls = self.parser.parser_url(content)
            self.urlManager.add_urls(new_urls)
            try:
                content = self.parser.parser_data(self.driver)
                self.output.store_data(content)
            except Exception:
                i = i - 1
                print('error: url may not exist: ' + self.driver.current_url)
        self.output.output_end()
        self.urlManager.save_status()
        # self.driver.close()
        print('end')


if __name__ == '__main__':
    spider = Spider()
    spider.crawl('https://www.cnblogs.com/')
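A quick check of what the '/.*?/' pattern in crawl() extracts from a blog URL (made-up user name, for illustration only):

import re
segments = re.findall('/.*?/', 'https://www.cnblogs.com/someuser/')
# segments == ['//', '/someuser/']: index 0 is the '//' after 'https:',
# index 1 is the user segment that gets appended to home.cnblogs.com
print('https://home.cnblogs.com' + segments[1])  # https://home.cnblogs.com/someuser/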
UrlManager.py
import pickle
import hashlib


class UrlManager():
    def __init__(self):
        # load each set from its own file (the names must match save_status)
        self.old_urls = self.load_process('old_urls')
        self.new_urls = self.load_process('new_urls')

    def load_process(self, file_name):
        print('loading .')
        try:
            with open(file_name, 'rb') as f:
                tmp = pickle.load(f)
            return tmp
        except (OSError, EOFError):
            print('file may not exist, will create it')
            new_set = set()
            self.save_process(file_name, new_set)
            return new_set

    def save_process(self, file_name, data):
        with open(file_name, 'wb') as f:
            pickle.dump(data, f)

    def save_status(self):
        self.save_process('new_urls', self.new_urls)
        self.save_process('old_urls', self.old_urls)

    def add_urls(self, urls):
        for url in urls:
            m = hashlib.md5()
            m.update(url.encode('utf8'))
            url_md5 = m.hexdigest()[8:-8]
            if url not in self.new_urls and url_md5 not in self.old_urls:
                self.new_urls.add(url)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        m = hashlib.md5()
        m.update(new_url.encode('utf8'))
        url_md5 = m.hexdigest()[8:-8]
        self.old_urls.add(url_md5)
        return new_url

    def new_urls_size(self):
        return len(self.new_urls)

    def old_urls_size(self):
        return len(self.old_urls)
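Rather than pickling every visited URL, UrlManager keeps only the middle 16 hex characters of each URL's MD5 in old_urls, which keeps the persisted set small. A short illustration:

import hashlib
m = hashlib.md5()
m.update('https://www.cnblogs.com/someuser/'.encode('utf8'))
print(m.hexdigest())        # full 32-character digest
print(m.hexdigest()[8:-8])  # the middle 16 characters actually stored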
HtmlParser.py
import re


class HtmlParser(object):
    def parser_url(self, content):
        # collect every blog home page URL found on the current page
        pattern = re.compile(r'https://www\.cnblogs\.com/\w*/')
        all_urls = re.findall(pattern, content)
        all_urls = list(set(all_urls))
        return all_urls

    def parser_data(self, driver):
        data = {}
        # Selenium 3 API; in Selenium 4 use driver.find_element(By.CLASS_NAME, ...)
        user_id = driver.find_element_by_class_name('display_name').text
        all_message = driver.find_element_by_class_name('user_profile').text
        all_message = all_message.split('\n')
        all_message.insert(0, '用户ID:' + user_id + '\n')
        # map the Chinese field labels on the profile page to column names
        switch = {'用户ID': 'user_id',
                  '姓名': 'name',
                  '性别': 'sex',
                  '出生日期': 'birth_day',
                  '家乡': 'hometown',
                  '现居住地': 'live_place',
                  '单位': 'work_for',
                  '工作状况': 'job_status',
                  '感兴趣的技术': 'interest_technology',
                  '最近目标': 'recent_goal',
                  '座右铭': 'mark_words',
                  '自我介绍': 'introduce',
                  '园龄': 'blog_age',
                  '博客': 'blog_address',
                  '婚姻': 'marriage',
                  '职位': 'position',
                  'QQ': 'qq',
                  'Email': 'email'
                  }
        key = ''
        value = ''
        for each in all_message:
            try:
                each = each.replace('\n', '')
                key = switch[each.split(':')[0]]
                value = each.split(':')[1]
                data[key] = value
            except (KeyError, IndexError):
                # a line without a known 'label:value' shape is treated as
                # the continuation of the previous field's value
                print('split error: ' + each + ', auto fixed..')
                value = value + each
                data[key] = value
        print(data)
        return data
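The except branch in parser_data() is what stitches multi-line field values back together: a line that does not match the 'label:value' shape is appended to the previous field. A self-contained illustration with made-up profile lines:

switch = {'自我介绍': 'introduce'}
data, key, value = {}, '', ''
for each in ['自我介绍:hello', 'world']:  # 'world' is a stray continuation line
    try:
        key = switch[each.split(':')[0]]
        value = each.split(':')[1]
        data[key] = value
    except (KeyError, IndexError):
        value = value + each  # glue the stray line onto the previous field
        data[key] = value
print(data)  # {'introduce': 'helloworld'}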
HtmlDownloader.py
import json


class HtmlDownloader(object):
    def download_root(self, url, driver):
        # open the site once so the cookie domain matches, then inject the
        # saved login cookies and reload the page as a logged-in user
        driver.get(url)
        with open('cookies.json', 'r', encoding='utf-8') as f:
            listCookies = json.loads(f.read())
        for cookie in listCookies:
            driver.add_cookie({
                'domain': cookie['domain'],  # must keep the leading dot, as in '.xxx.com'
                'name': cookie['name'],
                'value': cookie['value']
            })
        driver.refresh()
        return driver.page_source

    def download(self, driver, url):
        driver.get(url)
        return driver.page_source
DataOutput.py
import sqlite3


class DataOutput(object):
    def __init__(self):
        self.cx = sqlite3.connect('cnblog.db')
        self.table_name = 'cnblog'
        self.create_table()

    def create_table(self):
        values = '''
            id integer primary key autoincrement,
            user_id varchar(50) not null,
            name varchar(50),
            sex varchar(6),
            birth_day varchar(30),
            hometown varchar(50),
            live_place varchar(50),
            marriage varchar(20),
            position varchar(30),
            work_for varchar(50),
            job_status varchar(20),
            interest_technology varchar(200),
            recent_goal varchar(500),
            mark_words varchar(500),
            introduce varchar(500),
            blog_age varchar(30),
            blog_address varchar(100),
            qq varchar(15),
            email varchar(30)
        '''
        self.cx.execute('create table if not exists %s(%s)' % (self.table_name, values))

    def store_data(self, data):
        flag = 0
        user_id = ''
        for key, value in data.items():
            if flag == 0:
                # the first item parser_data() produces is always user_id,
                # so insert the row first, then fill it in field by field;
                # values are passed as parameters so quotes in scraped text
                # cannot break the statement
                cmd = 'insert into %s (%s) values (?)' % (self.table_name, key)
                self.cx.execute(cmd, (value,))
                user_id = value
                flag = 1
            else:
                cmd = 'update %s set %s=? where user_id=?' % (self.table_name, key)
                self.cx.execute(cmd, (value, user_id))
        self.cx.commit()

    def output_end(self):
        self.cx.close()
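Once a crawl has finished, the collected profiles can be inspected straight from the database. A minimal check, using the column names created above:

import sqlite3

cx = sqlite3.connect('cnblog.db')
for row in cx.execute('select user_id, blog_age, blog_address from cnblog limit 5'):
    print(row)
cx.close()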