Wanwei Baike people


Chinese Wikipedia cannot be reached, but fortunately there is Wanwei Baike (wanweibaike.com), from which the birth dates of famous people can be collected. With only a date and no birth hour, just six of the eight bazi characters can be derived, but at least the birth dates of foreign figures are relatively accurate.
1 Database setup script

create database if not exists wiki_person
    default character set utf8
    default collate utf8_general_ci;

show databases;

use wiki_person;
DROP TABLE IF EXISTS `life`;
CREATE TABLE `life` (
  `id` char(32) NOT NULL DEFAULT '' COMMENT 'ID',
  `name` varchar(100) NOT NULL DEFAULT '' COMMENT 'name',
  `cs` varchar(11) NOT NULL DEFAULT '' COMMENT 'birth date',
  `qs` varchar(11) NOT NULL DEFAULT '' COMMENT 'death date',
  `sex` char(1) NOT NULL DEFAULT '' COMMENT 'sex',
  `url` varchar(200) NOT NULL DEFAULT '' COMMENT 'URL',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Schema fix: add the cs_bazi column
ALTER TABLE `life`
ADD COLUMN `cs_bazi`  char(8) NULL AFTER `url`;

# Export table life as SQL (wiki_person is the database, life the table name)
mysqldump -u root -p wiki_person life > /appdata/life20210215.sql

2 Scrapy spider

import scrapy
from pyquery import PyQuery as pq
import re
from ..items import LifeItem
from ..util import str_util

class SpiderQs1900(scrapy.Spider):
    name = 'spider_qs_1900'
    prefix = 'https://www.wanweibaike.com/'
    allowed_domains = ['www.wanweibaike.com']
    start_urls=['https://www.wanweibaike.com']

    def parse(self,response):
        for year in range(1900,1910):
            url = "https://www.wanweibaike.com/wiki/Category-{}%E5%B9%B4%E9%80%9D%E4%B8%96".format(year)
            print(url)
            yield scrapy.Request(url,self.parse_cur_page)

    def parse_cur_page(self,response):
        soup = pq(response.text)
        person_links = soup('.mw-category a')
        for person_link in person_links:
            person_name = pq(person_link).text()
            href = pq(person_link).attr('href')
            if re.match(r'^/wiki-\d{4}', href) is None:
                url = self.prefix+href
                yield scrapy.Request(url,self.parse_person)

    def parse_person(self, response):
        '''
        Parse a single person page
        '''
        soup = pq(response.text)
        # id
        url = response.url
        id = str_util.md5(url)
        # name
        name = soup('.content h1').text()
        ######################################## lifespan ########################################
        content = soup('#mw-content-text p:lt(2)').text()
        if len(re.findall('(她|女性)',content))>0:
            sex = '女'
        else:
            sex = '男'
        # birth/death dates appear inside full-width parentheses in the intro paragraph
        cs_qses = re.findall(r'（(.+?)）', content)
        if not cs_qses:
            return
        cs_qs = cs_qses[0]
        rqs = re.findall(r'\d{4}年\d{1,2}月\d{1,2}日', cs_qs)
        if len(rqs) >= 2:
            # only keep people who have both a birth and a death date; anything less is useless for the statistics
            life_item = self.gen_life_item(id, name, sex, url, rqs[0], rqs[1])
            yield life_item

    def gen_life_item(self,id,name,sex,url,cs,qs):
        '''
        Build the life item
        '''
        life_item = LifeItem()
        life_item['id'] = id
        life_item['name'] = name
        life_item['sex'] = sex
        life_item['url']=url
        # birth date
        life_item['cs'] = cs
        # death date
        life_item['qs'] = qs
        return life_item
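
The spider above imports LifeItem and str_util, and the crawled items still have to reach the life table through an item pipeline; none of that code appears in the post. The sketch below is my own minimal guess at those pieces: the field names follow the DDL from section 1, while the md5 helper, the pipeline class and its pymysql upsert are assumptions rather than the author's code.

# Hypothetical sketch of items.py, util/str_util.py and a MySQL item pipeline
import hashlib
import pymysql
import scrapy

class LifeItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
    cs = scrapy.Field()   # birth date, e.g. 1900年1月1日
    qs = scrapy.Field()   # death date
    sex = scrapy.Field()
    url = scrapy.Field()

def md5(text):
    # assumed behaviour of str_util.md5: 32-character hex digest used as the row id
    return hashlib.md5(text.encode('utf-8')).hexdigest()

class MySQLPipeline:
    '''Register in ITEM_PIPELINES; upserts each LifeItem into the life table.'''
    def open_spider(self, spider):
        # placeholder credentials, adjust to your environment
        self.conn = pymysql.connect(host='localhost', user='root', password='password',
                                    database='wiki_person', charset='utf8')

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        sql = ('insert into life (id, name, cs, qs, sex, url) values (%s,%s,%s,%s,%s,%s) '
               'on duplicate key update cs=values(cs), qs=values(qs)')
        with self.conn.cursor() as cur:
            cur.execute(sql, (item['id'], item['name'], item['cs'],
                              item['qs'], item['sex'], item['url']))
        self.conn.commit()
        return item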

Scrapyd had been running for half a day without any visible change in the data. For some reason the cloud server's whitelist could not be configured for my cable-TV (广电) broadband connection, so the web console was unreachable from a browser and the job status had to be checked from the command line instead:

from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd.list_jobs('wanwiki')

{'node_name': 'sp1.test.com.cn', 'pending': [], 'running': [{'id': 'b46887fe6b4a11ebaa86fa163e8a50e0', 'spider': 'spider_hy', 'pid': 8895, 'start_time': '2021-02-10 10:50:18.443425'}], 'finished': [{'id': '79a4d6da6ac911ebaa86fa163e8a50e0', 'spider': 'spider_qs_1900', 'start_time': '2021-02-09 19:25:13.584024', 'end_time': '2021-02-09 20:35:19.403491'}, {'id': '9f8a77246ac911ebaa86fa163e8a50e0', 'spider': 'spider_qs_1910', 'start_time': '2021-02-09 19:26:18.444328', 'end_time': '2021-02-09 21:03:53.777995'}]}

Deploying to scrapyd

cd /appdata/wiki/wanwiki
scrapyd-deploy -p wanwiki
# schedule a spider
curl http://localhost:6800/schedule.json -d project=wanwiki -d spider=spider_hy_2020
# cancel a running job
curl http://localhost:6800/cancel.json -d project=wanwiki -d job=b46887fe6b4a11ebaa86fa163e8a50e0
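
The same status information shown with scrapyd_api above is also exposed by scrapyd's listjobs.json endpoint, so it can be checked straight from the shell (project name as deployed above):

# job status for the wanwiki project (same data as scrapyd.list_jobs above)
curl "http://localhost:6800/listjobs.json?project=wanwiki"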

3 Getting the bazi
The code below reads MySQL with pandas and does not paginate at all; next time I'll see whether I can put together a paging helper like the miemie paging framework in Java (a chunked-read sketch follows below).
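In the meantime, pandas itself can stream the table in pages through the chunksize parameter of read_sql; a minimal sketch, assuming the same engine_wiki engine used below:

# Minimal sketch: read the life table 1000 rows at a time instead of all at once
import pandas as pd
from db_engine import engine_wiki

for chunk in pd.read_sql('select * from life', engine_wiki(), chunksize=1000):
    # chunk is a DataFrame of at most 1000 rows; process each page here
    print(len(chunk))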
3.1 Getting the bazi, method 1
The bazi service below (china95.net) still worked on 2021-02-10, but stopped working on 2021-02-11 because browsers no longer support Flash.

# -*- coding: utf-8 -*-
# @time    : 2021/2/10 11:56
# @author  : dzm
# @desc    :
import requests
import time
import random
import urllib.parse
from pyquery import PyQuery as pq
import mysql_helper as helper
import pandas as pd
from db_engine import engine_wiki
import re
import json

def get_cookies():
    sess = requests.session()
    resp = sess.get('https://www.china95.net/paipan/bazi/')
    return sess, resp.cookies.get_dict()

def get_cs_bazi(year,month,day,sex,sess,cookies):
    time.sleep(random.randrange(2,5))
    params= {'name':'','area':'','sex':sex,'year':year,'month':month,'date':day
        ,'hour':'9','minute':'20','jingdu':120,'jingdufen':0,'taiyang':0,'quanpai':1,'stylebazi':'120','stylebazi':'0'}
    params['submit']=urllib.parse.quote('在线排八字盘'.encode('gb2312'))
    url = 'https://www.china95.net/paipan/bazi/bazi_show.asp'
    r = sess.post(url,data=params,cookies=cookies,verify=False)
    r.encoding = 'gb2312'
    # heavenly stems and earthly branches of the pillars
    tiangan = pq(r.text)('table.stylebazi tr>td font:eq(22)').text()
    tiangans = tiangan.split('     ')
    dizhi = pq(r.text)('table.stylebazi tr>td font:eq(24)').text()
    dizhis = dizhi.split('     ')
    # year, month and day pillars of the birth chart
    cs_year = tiangans[0]+dizhis[0]
    cs_month = tiangans[1]+dizhis[1]
    cs_day = tiangans[2]+dizhis[2]
    cs_bazi = cs_year + " " + cs_month + " " + cs_day
    # start of the luck cycles (起运), the luck pillars (大运) and their starting years
    start = pq(r.text)('table.stylebazi tr>td b:eq(16)').next().text().replace('\n','')
    dayuns = pq(r.text)('table.stylebazi tr>td b:eq(13)').next().text().split('  ')
    years = pq(r.text)('table.stylebazi tr>td b:eq(15)').next().text().split('  ')
    return cs_year,cs_month,cs_day,cs_bazi,start,dayuns,years

def get_qs_bazi(year,month,day,sex,dayuns,years,sess,cookies):
    time.sleep(random.randrange(2,5))
    params= {'name':'','area':'','sex':sex,'year':year,'month':month,'date':day
        ,'hour':'9','minute':'20','jingdu':120,'jingdufen':0,'taiyang':0,'quanpai':1,'stylebazi':'120','stylebazi':'0'}
    params['submit']=urllib.parse.quote('在线排八字盘'.encode('gb2312'))
    url = 'https://www.china95.net/paipan/bazi/bazi_show.asp'
    r = sess.post(url,data=params,cookies=cookies,verify=False)
    r.encoding = 'gb2312'
    #
    tiangan = pq(r.text)('table.stylebazi tr>td font:eq(22)').text()
    tiangans = tiangan.split('     ')
    dizhi = pq(r.text)('table.stylebazi tr>td font:eq(24)').text()
    dizhis = dizhi.split('     ')
    #
    qs_year = tiangans[0]+dizhis[0]
    qs_month = tiangans[1]+dizhis[1]
    qs_day = tiangans[2]+dizhis[2]
    qs_bazi = qs_year + " " + qs_month + " " + qs_day
    # luck pillar (大运) active in the year of death
    dy = get_dayun(dayuns,years,year)
    #
    return qs_year,qs_month,qs_day,qs_bazi,dy

def get_dayun(dayuns,years,current):
    # find the 10-year luck pillar (大运) whose starting year covers the given year
    for i,val in enumerate(years):
        if int(current)>=int(val) and int(current)<int(val)+10:
            return dayuns[i]
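
# Example (made-up values): with dayuns = ['甲子','乙丑','丙寅','丁卯'] and
# years = ['1905','1915','1925','1935'], get_dayun(dayuns, years, '1932') returns
# '丙寅', because 1932 falls inside the ten-year window starting in 1925.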

def get_datas():
    sess,cookies = get_cookies()
    sql = r'select * from life'
    df = pd.read_sql(sql,engine_wiki())
    results = json.loads(df.to_json(orient='records'))
    year_p = re.compile(r'\d+(?=年)')
    month_p = re.compile(r'(?<=年)\d+(?=月)')
    day_p = re.compile(r'(?<=月)\d+(?=日)')
    #
    for result in results:
        year = year_p.search(result['cs']).group()
        month = month_p.search(result['cs']).group()
        day = day_p.search(result['cs']).group()
        if result['sex']=='男':
            sex = 1
        else:
            sex = 0
        if int(year)<1884:
            continue
        print(result['id'])
        cs_year,cs_month,cs_day,cs_bazi,ln,dayuns,years = get_cs_bazi(year,month,day,sex,sess,cookies)
        result['cs_year'] = cs_year
        result['cs_month'] = cs_month
        result['cs_day'] = cs_day
        result['cs_bazi'] = cs_bazi
        result['ln'] = ln
        #
        year = year_p.search(result['qs']).group()
        month = month_p.search(result['qs']).group()
        day = day_p.search(result['qs']).group()
        qs_year,qs_month,qs_day,qs_bazi,dy = get_qs_bazi(year,month,day,sex,dayuns,years,sess,cookies)
        result['qs_year'] = qs_year
        result['qs_month'] = qs_month
        result['qs_day'] = qs_day
        result['qs_bazi'] = qs_bazi
        result['dy'] = dy
        #
        helper.to_sql('life',engine_wiki(),pd.DataFrame([result]))


if __name__ == '__main__':
    get_datas()
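
Both bazi scripts import db_engine.engine_wiki and mysql_helper.to_sql, which never appear in the post. Below is a minimal sketch under my own assumptions (a SQLAlchemy engine for wiki_person, and an upsert by primary key that silently drops columns the table does not have); it is not the author's implementation.

# Hypothetical sketch of db_engine.engine_wiki and mysql_helper.to_sql
import pandas as pd
from sqlalchemy import create_engine

def engine_wiki():
    # placeholder credentials for the wiki_person database
    return create_engine('mysql+pymysql://root:password@localhost:3306/wiki_person?charset=utf8')

def to_sql(table, engine, df):
    # assumed behaviour: upsert by primary key, keeping only columns the table actually has
    table_cols = set(pd.read_sql('select * from `{}` limit 0'.format(table), engine).columns)
    df = df[[c for c in df.columns if c in table_cols]]
    cols = list(df.columns)
    sql = 'insert into `{0}` ({1}) values ({2}) on duplicate key update {3}'.format(
        table,
        ','.join('`{}`'.format(c) for c in cols),
        ','.join(['%s'] * len(cols)),
        ','.join('`{0}`=values(`{0}`)'.format(c) for c in cols if c != 'id'))
    conn = engine.raw_connection()  # plain pymysql connection
    try:
        with conn.cursor() as cur:
            cur.executemany(sql, [tuple(row) for row in df.itertuples(index=False)])
        conn.commit()
    finally:
        conn.close()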

3.2 Getting the bazi, method 2
Since the first site stopped working, this version switches to pp.buyiju.com.

# -*- coding: utf-8 -*-
# @time    : 2021/2/11 10:34
# @author  : dzm
# @desc    :
import requests
import time
import random
import urllib
from pyquery import PyQuery as pq
import mysql_helper as helper
import pandas as pd
from db_engine import engine_wiki
import re
import json

def get_cs_bazi(year,month,day,sex):
    params={'txtName':'求测者','rdoSex':sex,'zty':0,'pid':'','cid':'请选择城市','data_type':0,'NianXian':1,'cboYear':year,'cboMonth':month
            ,'cboDay':day,'cboHour':'8-辰','cboMinute':'30','YearGan':1,'YearZhi':1,'MoonGan':1,'MoonZhi':1,'DayGan':1,'DayZhi':1
            ,'HourGan':1,'HourZhi':1}
    url = r'https://pp.buyiju.com/bazi/bazi.asp'
    r = requests.post(url,data=params)
    r.encoding = 'utf-8'
    soup = pq(r.text)
    zhu = soup('.content table:eq(0) > tr:eq(1) td:gt(0):lt(4)').text()
    if zhu:
        zhus = zhu.split(' ')
        cs_year = zhus[0]
        cs_month = zhus[1]
        cs_day = zhus[2]
        cs_bazi = cs_year + " " + cs_month + " " + cs_day
        # the luck pillars (大运) and the year in which each one starts
        dayuns = []
        dys = soup('.content table:eq(1) > tr:eq(1) td:gt(0)')
        for dy in dys:
            dayuns.append(pq(dy).text())
        years = []
        ys = soup('.content table:eq(1) > tr:eq(4) td:gt(0)')
        for y in ys:
            years.append(pq(y).text())
        # start of the luck cycles (起运)
        start = soup('.content table:eq(1) > tr:eq(5)  td:eq(1)').text()
        return cs_year,cs_month,cs_day,cs_bazi,dayuns,years,start
    else:
        return None,None,None,None,None,None,None

def get_qs_bazi(year,month,day,sex,dayuns,years):
    params={'txtName':'求测者','rdoSex':sex,'zty':0,'pid':'','cid':'请选择城市','data_type':0,'NianXian':1,'cboYear':year,'cboMonth':month
        ,'cboDay':day,'cboHour':'8-辰','cboMinute':'30','YearGan':1,'YearZhi':1,'MoonGan':1,'MoonZhi':1,'DayGan':1,'DayZhi':1
        ,'HourGan':1,'HourZhi':1}
    url = r'https://pp.buyiju.com/bazi/bazi.asp'
    r = requests.post(url,data=params)
    r.encoding = 'utf-8'
    soup = pq(r.text)
    #
    zhu = soup('.content table:eq(0) > tr:eq(1) td:gt(0):lt(4)').text()
    if zhu:
        zhus = zhu.split(' ')
        qs_year = zhus[0]
        qs_month = zhus[1]
        qs_day = zhus[2]
        qs_bazi = qs_year + " " + qs_month + " " + qs_day
        # luck pillar (大运) active in the year of death
        dy = get_dayun(dayuns,years,year)
        return qs_year,qs_month,qs_day,qs_bazi,dy
    else:
        return None,None,None,None,None

def get_dayun(dayuns,years,current):
    for i,val in enumerate(years):
        if int(current)>=int(val) and int(current)<int(val)+10:
            return dayuns[i]

def get_datas():
    sql = r'select * from life'
    df = pd.read_sql(sql,engine_wiki())
    results = json.loads(df.to_json(orient='records'))
    year_p = re.compile(r'\d+(?=年)')
    month_p = re.compile(r'(?<=年)\d+(?=月)')
    day_p = re.compile(r'(?<=月)\d+(?=日)')
    #
    for result in results:
        year = year_p.search(result['cs']).group()
        month = month_p.search(result['cs']).group()
        day = day_p.search(result['cs']).group()
        if result['sex']=='男':
            sex = 1
        else:
            sex = 0
        # if int(year)<1884:
        #     continue
        print(result['id'])
        cs_year,cs_month,cs_day,cs_bazi,dayuns,years,ln = get_cs_bazi(year,month,day,sex)
        if cs_year is None:
            continue
        result['cs_year'] = cs_year
        result['cs_month'] = cs_month
        result['cs_day'] = cs_day
        result['cs_bazi'] = cs_bazi
        result['ln'] = ln
        #
        year = year_p.search(result['qs']).group()
        month = month_p.search(result['qs']).group()
        day = day_p.search(result['qs']).group()
        qs_year,qs_month,qs_day,qs_bazi,dy = get_qs_bazi(year,month,day,sex,dayuns,years)
        if qs_year is None:
            continue
        result['qs_year'] = qs_year
        result['qs_month'] = qs_month
        result['qs_day'] = qs_day
        result['qs_bazi'] = qs_bazi
        result['dy'] = dy
        #
        helper.to_sql('life',engine_wiki(),pd.DataFrame([result]))

if __name__ == '__main__':
    # cs_year,cs_month,cs_day,cs_bazi,dayuns,years,start = get_cs_bazi(1894,2,29,1)
    # print(cs_year,cs_month,cs_day,cs_bazi,dayuns,years,start)
    get_datas()