见《宋书·范晔传》:“言之皆有实证,非为空谈。”子平有较高的或然率,但如果没有人物经历来佐证,就变成三教九流,成为“玄学”实在可惜。老外搞个MBTI性格测试就巴巴说是科学,有智慧的老前辈总结的经验,因为不懂而无法传承,散落在民间,只能偷偷摸摸地流传、被人看不起,实在是看不过去。
有时候感觉西方人很笨,调研70多个家庭跟踪他们的一生,然后给出结论发表论文。然后中国的学生就认为人家严谨有科学研究精神,何曾想过这70个样本相对人类这么大的基数根本不值得一提。另外一个视角,研究问题真的需要采用这么笨的方法吗?梁湘润大师等都说能看准一个人一生的70%就不错了,是的,人的极限估计也差不多是这些,另外20~30%还需要靠国运、环境和个人修养,人之一生岂是几百页的子平能讲完的呢,那岂不是白活。
我会一些计算机,另外对梁老的一些观点特别认同,大部分人都是普通人,每天为了家计日常奔波,为了妻财子禄寿而焦虑,要想做人上人,你真的能承受他们所经历的吗?
这篇博客主要是从百度百科上采集人物经历,后面再结合子平理论进行实证。
1 采集人物信息
# -*- coding: utf-8 -*-
# @time : 2022/1/22 11:03
# @author : dzm
# @desc : 百度娱乐人物
from sqlalchemy.engine import create_engine
from personspider.settings import MYSQL_CONFIG
import scrapy
from pyquery import PyQuery as pq
from personspider.utils import str_util,person_util
from personspider.items.baidu_person import BaiduPersonItem,BaiduPersonExperienceItem\
,BaiduUrlItem,BaiduPersonRelationItem
from personspider.service.baidu_service import BaiduUrlService
import re
class yulespider(scrapy.Spider):
    """Spider that crawls Baidu Baike celebrity pages.

    Pending page links are loaded from MySQL via BaiduUrlService.  Each
    parsed page yields: a BaiduPersonItem (basic info), one
    BaiduPersonExperienceItem per year/month experience fragment,
    BaiduPersonRelationItem records for linked people, and BaiduUrlItem
    records marking crawled/discovered links.
    """
    name = 'baidu_yule'

    def start_requests(self):
        # Read the links to crawl from the database.
        engine = create_engine(
            'mysql+pymysql://{}:{}@{}:3306/{}'.format(
                MYSQL_CONFIG['user'], MYSQL_CONFIG['password'],
                MYSQL_CONFIG['host'], MYSQL_CONFIG['db']),
            connect_args={'charset': 'utf8'}, pool_size=MYSQL_CONFIG['pool_size'])
        baiduUrlService = BaiduUrlService(engine)
        urls = baiduUrlService.get_urls()
        if urls:
            for url in urls:
                # dont_filter: the same link may legitimately be re-crawled.
                yield scrapy.Request(url=url.url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        cur_url = person_util.get_url(response.request.url)
        soup = pq(response.text)
        # --- basic person info: the <dt>/<dd> label/value pairs of the info box ---
        item = {}
        for basicInfo_block in soup('.basic-info .basicInfo-block'):
            block = pq(basicInfo_block)
            for i in range(block('dt').size()):
                label = str_util.clear(block('dt:eq({})'.format(i)).text())
                value = str_util.clear(block('dd:eq({})'.format(i)).text())
                item[label] = value
        person_item = self.pack_person(item, r'百度百科', cur_url)
        # NOTE(review): pack_person may return None / lack 'cn_name' for odd
        # pages; this would raise here, as in the original code.
        cur_name = person_item['cn_name']
        if 'birthday' in person_item:
            print('中文名:{}, 出生日期:{},链接:{}'.format(person_item['cn_name'], person_item['birthday'], person_item['url']))
        else:
            print('中文名:{}, 链接:{}'.format(person_item['cn_name'], person_item['url']))
        # The birthday must be present and contain a full year-month-day
        # before we bother extracting the person's experiences.
        valid_person = person_item and 'birthday' in person_item \
            and person_item['birthday'] \
            and re.match(r'\d{4}[年\-]\d{1,2}[月\-]\d{1,2}日?', person_item['birthday'])
        if valid_person:
            person_id = person_item['id']
            yield person_item
            # --- life experiences: paragraphs that start with "XXXX年" but
            # are not a single full date (those are usually the birth line) ---
            for para in soup('.para'):
                content = pq(para).text()
                if re.match(r'^\d{4}年', content) and not re.match(r'^\d{4}年\d{1,2}月\d{1,2}日', content):
                    experiences = person_util.get_experience(content)
                    if experiences:
                        for experience in experiences:
                            if experience and experience['experience']:
                                exp_item = BaiduPersonExperienceItem()
                                # id is the md5 of the whole experience dict's repr
                                exp_item['id'] = str_util.gen_md5(experience)
                                exp_item['person_id'] = person_id
                                exp_item['year'] = experience['year']
                                if 'month' in experience:
                                    exp_item['month'] = experience['month']
                                exp_item['experience'] = experience['experience']
                                yield exp_item
        # --- mark the link being crawled as done (status '1') ---
        curl_url_item = BaiduUrlItem()
        curl_url_item['id'] = str_util.gen_md5(cur_url)
        curl_url_item['url'] = cur_url
        curl_url_item['status'] = '1'
        # BUG FIX: this used to store the leftover info-box label from the
        # parsing loop above; the intended value is the person's name.
        curl_url_item['name'] = cur_name
        yield curl_url_item
        # --- person relations (family, colleagues, ...) ---
        for relation in soup('.relations li'):
            rel = pq(relation)
            url = 'https://baike.baidu.com' + rel('a').attr('href')
            url = person_util.get_url(url)
            # The relation markup differs between pages: sometimes the name
            # is in .title and the relation tag in .name, sometimes both are
            # packed into .name with the name duplicated in its title attr.
            name = rel('.title').text()
            if name:
                tag = rel('.name').text()
            else:
                name = rel('.name').attr('title')
                tag = rel('.name').text()
                tag = tag[:len(tag) - len(name)]
            if valid_person:
                # relation edge between the current person and the linked one
                relation_item = BaiduPersonRelationItem()
                relation_item['one'] = person_id
                relation_item['one_name'] = cur_name
                relation_item['one_url'] = cur_url
                relation_item['two'] = str_util.gen_md5(url)
                relation_item['two_name'] = name
                relation_item['two_url'] = url
                relation_item['relation'] = tag
                yield relation_item
            # newly discovered link, queued for crawling (status '0')
            url_item = BaiduUrlItem()
            url_item['id'] = str_util.gen_md5(url)
            url_item['url'] = url
            url_item['status'] = '0'
            url_item['name'] = name
            yield url_item

    def pack_person(self, content, source, url):
        """Map the raw info-box dict onto a BaiduPersonItem.

        :param content: dict of info-box label -> cleaned text
        :param source: data-source tag stored on the item (e.g. 百度百科)
        :param url: canonical page url; its md5 becomes the item id
        :return: a BaiduPersonItem, or None when *content* is empty
        """
        if not content:
            return None
        item = BaiduPersonItem()
        item['source'] = source
        item['url'] = url
        item['id'] = str_util.gen_md5(url)
        for key, value in content.items():
            if key == '中文名':
                item['cn_name'] = value
            elif key == '外文名':
                item['en_name'] = value
            elif key == '性别':
                item['sex'] = value
            elif key == '国籍':
                item['nation'] = value
            elif key == '出生日期':
                # keep only a complete year-month-day fragment
                birthday = re.search(r'\d{4}[年\-]\d{1,2}[月\-]\d{1,2}日?', value, re.S)
                if birthday:
                    item['birthday'] = birthday.group(0)
            elif key == '出生地':
                item['birthplace'] = value
            elif key == '逝世日期':
                # BUG FIX: this branch was a duplicated '外文名' test and
                # therefore unreachable; deathday was never populated.
                item['deathday'] = value
            elif key == '身高':
                item['height'] = person_util.get_height(value)
            elif key == '毕业院校':
                item['school'] = value
            elif key == '职业':
                item['occupation'] = value
            elif key == '主要成就':
                item['achievements'] = value
            elif key == '代表作品':
                item['representation'] = value
        return item
if __name__ == '__main__':
    # The spider is launched through scrapy's CLI, not by running this module.
    pass
2 提取内容
import re
from personspider.utils import str_util
def get_url(url):
    """Return *url* with any query string (everything from '?') removed.

    BUG FIX: the old code tested ``index`` truthily, so a '?' at position
    0 was treated as "no query string" and left in place.

    :param url: url string, with or without a query string
    :return: the url up to (excluding) the first '?'
    """
    head, _, _ = url.partition('?')
    return head
def get_experience(text):
    '''
    Split a biography paragraph into {year, month, experience} records.

    *text* looks like "2001年...。2003年3月...". It is first split on
    "XXXX年" markers; each fragment is then optionally split again on
    "XX月" markers, producing one record per year (or per month when
    months are present).

    BUG FIX: ``re.split``'s third positional argument is *maxsplit*, not
    *flags*; passing ``re.S`` (== 16) silently capped the number of
    splits at 16 and truncated long biographies.

    :param text: paragraph text starting with a year
    :return: list of dicts with keys 'year', optional 'month', 'experience'
    '''
    results = re.split(r'(\d{4}年)', text)
    contents = []
    i = 1
    while i < len(results) - 1:
        year = results[i]
        # drop a leading full-width comma, then normalise the text
        experience = str_util.clear(results[i + 1].strip(','))
        if re.search(r'\d{1,2}月', experience):
            months = re.split(r'(\d{1,2}月)', experience)
            j = 1
            while j < len(months) - 1:
                month = months[j]
                experience = str_util.clear(months[j + 1].strip(','))
                contents.append({'year': year, 'month': month, 'experience': experience})
                j = j + 2
        else:
            contents.append({'year': year, 'experience': experience})
        i = i + 2
    return contents
def get_height(height):
    '''
    Extract the leading digits (optionally followed by "cm") from a
    height string, e.g. "身高180cm" -> "180cm".  When nothing matches,
    the input is returned unchanged.
    '''
    match = re.search(r'\d{1,4}(cm)?', height, re.S)
    return match.group(0) if match else height
import hashlib
import re
def gen_md5(item):
    '''
    Return the hex MD5 digest of str(item), UTF-8 encoded.
    '''
    digest = hashlib.md5(str(item).encode('utf-8'))
    return digest.hexdigest()
def remove_xa0(value):
    '''
    Strip U+00A0 (non-breaking space) characters from *value*.
    '''
    return ''.join(ch for ch in value if ch != '\xa0')
def remove_quote(value):
    '''
    Drop Baidu Baike citation markers such as "[1]" or "[3-5]" from
    *value* (the closing ']' is matched inside the character class).
    '''
    return re.sub(r'\[[\d\-\]]+', '', value)
def remove_blank(value):
    '''
    Remove every ASCII space from *value*.
    '''
    return value.translate({ord(' '): None})
def clear(value):
    '''
    Normalise scraped text: strip non-breaking spaces, citation markers
    and plain spaces, in that order.
    '''
    return remove_blank(remove_quote(remove_xa0(value)))
3 数据管道
from sqlalchemy.engine import create_engine
from personspider.items.baidu_person import BaiduPersonItem,BaiduPersonExperienceItem\
,BaiduUrlItem,BaiduPersonRelationItem
from personspider.service.baidu_service import BaiduPersonService,BaiduPersonExperienceService\
,BaiduPerson,BaiduPersonExperience,BaiduPersonRelation,BaiduUrl\
,BaiduPersonRelationService,BaiduUrlService
class MysqlPipeline(object):
    """Scrapy pipeline that persists items into MySQL via SQLAlchemy.

    One service object per table; ``process_item`` routes each item type
    to the matching service's ``insert``.
    """

    def __init__(self, engine):
        self.baiduPersonService = BaiduPersonService(engine)
        self.baiduPersonExperienceService = BaiduPersonExperienceService(engine)
        self.baiduUrlService = BaiduUrlService(engine)
        self.baiduPersonRelationService = BaiduPersonRelationService(engine)

    def process_item(self, item, spider):
        # isinstance instead of type()==: idiomatic, and tolerant of
        # item subclassing (the item classes here are distinct types).
        if isinstance(item, BaiduPersonItem):
            self.baiduPersonService.insert(BaiduPerson(**item))
        elif isinstance(item, BaiduPersonExperienceItem):
            self.baiduPersonExperienceService.insert(BaiduPersonExperience(**item))
        elif isinstance(item, BaiduUrlItem):
            self.baiduUrlService.insert(BaiduUrl(**item))
        elif isinstance(item, BaiduPersonRelationItem):
            self.baiduPersonRelationService.insert(BaiduPersonRelation(**item))
        # BUG FIX: Scrapy pipelines must return the item (or raise
        # DropItem); returning None silently drops it for later pipelines.
        return item

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor used by Scrapy: build the engine from settings."""
        mysql_config = settings.get('MYSQL_CONFIG')
        engine = create_engine(
            'mysql+pymysql://{}:{}@{}:3306/{}'.format(
                mysql_config['user'], mysql_config['password'],
                mysql_config['host'], mysql_config['db']),
            connect_args={'charset': 'utf8'}, pool_size=mysql_config['pool_size'])
        return cls(engine)
4 写数据库
class BaiduPersonService(object):
    """DAO for the BaiduPerson table; sends a mail when an insert fails."""

    def __init__(self, engine):
        self.engine = engine
        Session = sessionmaker(engine)
        self.session = Session()
        self.emailService = EmailService()

    def exist(self, id):
        # True when a row with this primary key already exists.
        query = self.session.query(BaiduPerson).filter(BaiduPerson.id == id)
        return query.count() > 0

    def insert(self, record):
        """Insert *record* unless its id is already present; best-effort
        (failures are reported by email, not raised)."""
        if not self.exist(record.id):
            try:
                record.create_time = datetime.datetime.now()
                self.session.add(record)
                self.session.commit()
            except Exception as e:
                # BUG FIX: roll back the failed transaction, otherwise the
                # session stays in a broken state and every later insert
                # raises as well.
                self.session.rollback()
                title = r'{}写入数据库失败'.format(record.cn_name)
                content = r'ERROR {}'.format(str(e))
                self.emailService.sendEmail(title, content)
5 异常邮件发送
如果在解析过程中出现异常,总不能时刻盯着吧,写个发邮件的功能来告知我,岂不是很安逸
import smtplib
from email.header import Header
from email.mime.text import MIMEText
from personspider.settings import EMAIL_CONFIG
class EmailService(object):
    """Send plain-text notification mail through the configured SMTP-over-SSL host."""

    def sendEmail(self, title, content):
        """Send *content* with subject *title* to the configured receivers.

        Errors are printed, not raised (best-effort notification).
        """
        message = MIMEText(content, 'plain', 'utf-8')  # body, subtype, charset
        message['From'] = "{}".format(EMAIL_CONFIG['sender'])
        message['To'] = ",".join(EMAIL_CONFIG['receivers'])
        message['Subject'] = title
        smtpObj = None
        try:
            smtpObj = smtplib.SMTP_SSL(EMAIL_CONFIG['smtp']['host'], 465)  # SSL, usually port 465
            smtpObj.login(EMAIL_CONFIG['smtp']['user'], EMAIL_CONFIG['smtp']['password'])
            smtpObj.sendmail(EMAIL_CONFIG['sender'], EMAIL_CONFIG['receivers'], message.as_string())
            print("mail has been send successfully.")
        except smtplib.SMTPException as e:
            print(e)
        finally:
            # BUG FIX: the connection was never closed and leaked a socket
            # on every call; quit() is itself best-effort here.
            if smtpObj is not None:
                try:
                    smtpObj.quit()
                except Exception:
                    pass
def send_email2(SMTP_host, from_account, from_passwd, to_account, subject, content):
    """Send one plain-text mail over an unencrypted SMTP connection.

    :param SMTP_host: SMTP server host name
    :param from_account: sender account (also used for login)
    :param from_passwd: sender password
    :param to_account: recipient address
    :param subject: mail subject
    :param content: plain-text body
    """
    email_client = smtplib.SMTP(SMTP_host)
    try:
        email_client.login(from_account, from_passwd)
        # build the message
        msg = MIMEText(content, 'plain', 'utf-8')
        msg['Subject'] = Header(subject, 'utf-8')  # subject
        msg['From'] = from_account
        msg['To'] = to_account
        email_client.sendmail(from_account, to_account, msg.as_string())
    finally:
        # BUG FIX: quit even when login/send raises, so the socket does
        # not leak on failure.
        email_client.quit()
if __name__ == '__main__':
    # Manual smoke test for the mail notification path.
    EmailService().sendEmail(r'数据库异常', r'很多多多问题')