带二级页面星尘算命网的爬虫

爬虫的代码比较简单,因为没有什么反爬,我比较喜欢用PyQuery,使用起来像jquery一样方便

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import scrapy
from pyquery import PyQuery as pq
import re
from life_example.items import LifeExampleItem

class S128Spider(scrapy.Spider):
    name = "s128"
    start_urls = [
        "http://www.s128.com/lb.php"
    ]
    page = 1

    def parse(self,response):
        soup = pq(response.body_as_unicode())
        lis = soup('.fly-list li')
        for li in lis:
            li_ = soup(li)
            href = li_('h2>a').attr('href')
            title = li_('h2>a').text()
            url = 'http://www.s128.com/'+ href
            yield scrapy.Request(url=url,callback=self.parse_detail)
        if self.page<=353:
            self.page = self.page + 1
            url = 'http://www.s128.com/lb.php?t=&p='+str(self.page)
            yield scrapy.Request(url=url,callback=self.parse)

    def parse_detail(self,response):
        soup = pq(response.body_as_unicode())
        title = soup('.moudle-card h1').text()
        item = LifeExampleItem()
        item['year'] = title[0:2]
        item['month'] = title[2:4]
        item['day'] = title[4:6]
        item['hour'] = title[6:8]
        item['bazi'] = item['year'] + ' ' + item['month'] + ' '+ item['day'] + ' '+ item['hour']
        if title[9:11] == '男命':
            item['sex'] = '01'
        else:
            item['sex'] = '00'
        item['name'] = title[12:]
        content = soup('.con.layui-text').text()
        item['concise_evalute'] = content.strip()
        yield item

管道代码

class LifeExamplePipeline(object):
    # def open_spider(self,spider):
    #     self.out = open('s128.csv','a',encoding='utf-8')
    #     self.cr = csv.writer(self.out)

    def process_item(self, item, spider):
        if type(item) == LifeExampleItem:
            item['id'] = str.replace(str(uuid.uuid1()),'-','')
            to_sql('bazi_person',engine_bazi(),dataframe=pd.DataFrame([item]))

写入mysql数据的代码

import os
import pandas as pd
import shutil
import sys
import re
import time
import pymysql



def sql_cols(df, usage="sql"):
    cols = tuple(df.columns)
    if usage == "sql":
        cols_str = str(cols).replace("'", "`")
        if len(df.columns) == 1:
            cols_str = cols_str[:-2] + ")"  # to process dataframe with only one column
        return cols_str
    elif usage == "format":
        base = "'%%(%s)s'" % cols[0]
        for col in cols[1:]:
            base += ", '%%(%s)s'" % col
        return base
    elif usage == "values":
        base = "%s=VALUES(%s)" % (cols[0], cols[0])
        for col in cols[1:]:
            base += ", `%s`=VALUES(`%s`)" % (col, col)
        return base


def to_sql(tb_name, conn, dataframe, type="update", chunksize=2000, debug=False):
    """
    Dummy of pandas.to_sql, support "REPLACE INTO ..." and "INSERT ... ON DUPLICATE KEY UPDATE (keys) VALUES (values)"
    SQL statement.

    Args:
        tb_name: str
            Table to insert get_data;
        conn:
            DBAPI Instance
        dataframe: pandas.DataFrame
            Dataframe instance
        type: str, optional {"update", "replace", "ignore"}, default "update"
            Specified the way to update get_data. If "update", then `conn` will execute "INSERT ... ON DUPLICATE UPDATE ..."
            SQL statement, else if "replace" chosen, then "REPLACE ..." SQL statement will be executed; else if "ignore" chosen,
            then "INSERT IGNORE ..." will be excuted;
        chunksize: int
            Size of records to be inserted each time;
        **kwargs:

    Returns:
        None
    """
    if len(dataframe) == 0:
        return

    tb_name = ".".join(["`" + x + "`" for x in tb_name.split(".")])

    df = dataframe.copy(deep=False)
    df = df.fillna("None")
    df = df.applymap(lambda x: re.sub('([\'\"\\\])', '\\\\\g<1>', str(x)))
    cols_str = sql_cols(df)
    sqls = []
    for i in range(0, len(df), chunksize):
        # print("chunk-{no}, size-{size}".format(no=str(i/chunksize), size=chunksize))
        df_tmp = df[i: i + chunksize]

        if type == "replace":
            sql_base = "REPLACE INTO {tb_name} {cols}".format(
                tb_name=tb_name,
                cols=cols_str
            )

        elif type == "update":
            sql_base = "INSERT INTO {tb_name} {cols}".format(
                tb_name=tb_name,
                cols=cols_str
            )
            sql_update = "ON DUPLICATE KEY UPDATE {0}".format(
                sql_cols(df_tmp, "values")
            )

        elif type == "ignore":
            sql_base = "INSERT IGNORE INTO {tb_name} {cols}".format(
                tb_name=tb_name,
                cols=cols_str
            )

        sql_val = sql_cols(df_tmp, "format")
        vals = tuple([sql_val % x for x in df_tmp.to_dict("records")])
        sql_vals = "VALUES ({x})".format(x=vals[0])
        for i in range(1, len(vals)):
            sql_vals += ", ({x})".format(x=vals[i])
        sql_vals = sql_vals.replace("'None'", "NULL")

        sql_main = sql_base + sql_vals
        if type == "update":
            sql_main += sql_update

        if sys.version_info.major == 2:
            sql_main = sql_main.replace("u`", "`")
        if sys.version_info.major == 3:
            sql_main = sql_main.replace("%", "%%")

        if debug is False:
            try:
                conn.execute(sql_main)
            except pymysql.err.InternalError as e:
                print("ENCOUNTERING ERROR: {e}, RETRYING".format(e=e))
                time.sleep(10)
                conn.execute(sql_main)
        else:
            sqls.append(sql_main)
    if debug:
        return sqls


def delete(tb_name, conn, dataframe, chunksize=10000):
    """

    Args:
        tb_name:
        conn:
        dataframe:
        chunksize:

    Returns:

    """
    dataframe = dataframe.dropna().drop_duplicates()
    for i in range(0, len(dataframe), chunksize):
        df = dataframe[i: i + chunksize]
        condition = generate_condition(df)
        sql = "DELETE FROM {tb} WHERE ({criterion})".format(
            tb=tb_name, criterion=condition
        )
        conn.execute(sql)


def generate_condition(dataframe):
    dataframe = dataframe.dropna()
    dataframe = dataframe.drop_duplicates()  # 避免由于重复条件应用于删除sql时导致
    cols = dataframe.columns
    if len(cols) == 1:
        tmp = str(tuple(dataframe[cols[0]].apply(lambda x: str(x)).tolist()))
        if len(dataframe) == 1:
            tmp = tmp[:-2] + ")"
        condition = "{col} IN {val}".format(col=cols[0], val=tmp)
    else:
        s = ""
        for i, col in enumerate(cols):
            if i > 0:
                s += " AND `{k}` = ".format(k=col) + "'{" + str(i) + "}'"
            else:
                s = "`{k}` = ".format(k=col) + "'{" + str(i) + "}'"
        s = "(" + s + ")"
        conditions = []
        for val in dataframe.as_matrix():
            tmp = s.format(*val)
            conditions.append(tmp)

        condition = " OR ".join(conditions)
    return condition




# 工具函数
def format_list(ids):
    """
    格式化,以满足sql语句中 where ids in ("1", "2")
    将['JR00001', 'JR00002'] 列表格式化为str: "('JR00001','JR00002')"
    :param ids: type: list | set | str
    :return:
    """
    if isinstance(ids, str):
        ids = [ids]

    if isinstance(ids, list) or isinstance(ids, set):
        _ids = ["'{}'".format(x) for x in ids]
        if len(ids) > 0:
            str_ids = "(" + ','.join(_ids) + ")"
        else:
            str_ids = "('')"
        return str_ids

    else:
        return ids

数据库配置

from ..utils.sql_engine import _engine
from life_example.settings import MYSQL_USER,MYSQL_PASSWD,MYSQL_HOST,MYSQL_PORT,MYSQL_DBNAME

def engine_bazi():
    db = '{}:{}@{}:{}'.format(MYSQL_USER, MYSQL_PASSWD, MYSQL_HOST, MYSQL_PORT)
    return _engine(db, MYSQL_DBNAME)

数据库引擎

from sqlalchemy import create_engine

def _engine(connnect_info, db_name):
    """
    :param connnect_info:连接信息,主机端口,用户密码
    :param db_name: 数据库名
    :return:
    """
    return create_engine("mysql+pymysql://{}/{}".format(connnect_info, db_name), connect_args={"charset": "utf8"})

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

warrah

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值