爬虫的代码比较简单,因为没有什么反爬,我比较喜欢用PyQuery,使用起来像jquery一样方便
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import scrapy
from pyquery import PyQuery as pq
import re
from life_example.items import LifeExampleItem
class S128Spider(scrapy.Spider):
name = "s128"
start_urls = [
"http://www.s128.com/lb.php"
]
page = 1
def parse(self,response):
soup = pq(response.body_as_unicode())
lis = soup('.fly-list li')
for li in lis:
li_ = soup(li)
href = li_('h2>a').attr('href')
title = li_('h2>a').text()
url = 'http://www.s128.com/'+ href
yield scrapy.Request(url=url,callback=self.parse_detail)
if self.page<=353:
self.page = self.page + 1
url = 'http://www.s128.com/lb.php?t=&p='+str(self.page)
yield scrapy.Request(url=url,callback=self.parse)
def parse_detail(self,response):
soup = pq(response.body_as_unicode())
title = soup('.moudle-card h1').text()
item = LifeExampleItem()
item['year'] = title[0:2]
item['month'] = title[2:4]
item['day'] = title[4:6]
item['hour'] = title[6:8]
item['bazi'] = item['year'] + ' ' + item['month'] + ' '+ item['day'] + ' '+ item['hour']
if title[9:11] == '男命':
item['sex'] = '01'
else:
item['sex'] = '00'
item['name'] = title[12:]
content = soup('.con.layui-text').text()
item['concise_evalute'] = content.strip()
yield item
管道代码
class LifeExamplePipeline(object):
# def open_spider(self,spider):
# self.out = open('s128.csv','a',encoding='utf-8')
# self.cr = csv.writer(self.out)
def process_item(self, item, spider):
if type(item) == LifeExampleItem:
item['id'] = str.replace(str(uuid.uuid1()),'-','')
to_sql('bazi_person',engine_bazi(),dataframe=pd.DataFrame([item]))
写入mysql数据的代码
import os
import pandas as pd
import shutil
import sys
import re
import time
import pymysql
def sql_cols(df, usage="sql"):
cols = tuple(df.columns)
if usage == "sql":
cols_str = str(cols).replace("'", "`")
if len(df.columns) == 1:
cols_str = cols_str[:-2] + ")" # to process dataframe with only one column
return cols_str
elif usage == "format":
base = "'%%(%s)s'" % cols[0]
for col in cols[1:]:
base += ", '%%(%s)s'" % col
return base
elif usage == "values":
base = "%s=VALUES(%s)" % (cols[0], cols[0])
for col in cols[1:]:
base += ", `%s`=VALUES(`%s`)" % (col, col)
return base
def to_sql(tb_name, conn, dataframe, type="update", chunksize=2000, debug=False):
"""
Dummy of pandas.to_sql, support "REPLACE INTO ..." and "INSERT ... ON DUPLICATE KEY UPDATE (keys) VALUES (values)"
SQL statement.
Args:
tb_name: str
Table to insert get_data;
conn:
DBAPI Instance
dataframe: pandas.DataFrame
Dataframe instance
type: str, optional {"update", "replace", "ignore"}, default "update"
Specified the way to update get_data. If "update", then `conn` will execute "INSERT ... ON DUPLICATE UPDATE ..."
SQL statement, else if "replace" chosen, then "REPLACE ..." SQL statement will be executed; else if "ignore" chosen,
then "INSERT IGNORE ..." will be excuted;
chunksize: int
Size of records to be inserted each time;
**kwargs:
Returns:
None
"""
if len(dataframe) == 0:
return
tb_name = ".".join(["`" + x + "`" for x in tb_name.split(".")])
df = dataframe.copy(deep=False)
df = df.fillna("None")
df = df.applymap(lambda x: re.sub('([\'\"\\\])', '\\\\\g<1>', str(x)))
cols_str = sql_cols(df)
sqls = []
for i in range(0, len(df), chunksize):
# print("chunk-{no}, size-{size}".format(no=str(i/chunksize), size=chunksize))
df_tmp = df[i: i + chunksize]
if type == "replace":
sql_base = "REPLACE INTO {tb_name} {cols}".format(
tb_name=tb_name,
cols=cols_str
)
elif type == "update":
sql_base = "INSERT INTO {tb_name} {cols}".format(
tb_name=tb_name,
cols=cols_str
)
sql_update = "ON DUPLICATE KEY UPDATE {0}".format(
sql_cols(df_tmp, "values")
)
elif type == "ignore":
sql_base = "INSERT IGNORE INTO {tb_name} {cols}".format(
tb_name=tb_name,
cols=cols_str
)
sql_val = sql_cols(df_tmp, "format")
vals = tuple([sql_val % x for x in df_tmp.to_dict("records")])
sql_vals = "VALUES ({x})".format(x=vals[0])
for i in range(1, len(vals)):
sql_vals += ", ({x})".format(x=vals[i])
sql_vals = sql_vals.replace("'None'", "NULL")
sql_main = sql_base + sql_vals
if type == "update":
sql_main += sql_update
if sys.version_info.major == 2:
sql_main = sql_main.replace("u`", "`")
if sys.version_info.major == 3:
sql_main = sql_main.replace("%", "%%")
if debug is False:
try:
conn.execute(sql_main)
except pymysql.err.InternalError as e:
print("ENCOUNTERING ERROR: {e}, RETRYING".format(e=e))
time.sleep(10)
conn.execute(sql_main)
else:
sqls.append(sql_main)
if debug:
return sqls
def delete(tb_name, conn, dataframe, chunksize=10000):
"""
Args:
tb_name:
conn:
dataframe:
chunksize:
Returns:
"""
dataframe = dataframe.dropna().drop_duplicates()
for i in range(0, len(dataframe), chunksize):
df = dataframe[i: i + chunksize]
condition = generate_condition(df)
sql = "DELETE FROM {tb} WHERE ({criterion})".format(
tb=tb_name, criterion=condition
)
conn.execute(sql)
def generate_condition(dataframe):
dataframe = dataframe.dropna()
dataframe = dataframe.drop_duplicates() # 避免由于重复条件应用于删除sql时导致
cols = dataframe.columns
if len(cols) == 1:
tmp = str(tuple(dataframe[cols[0]].apply(lambda x: str(x)).tolist()))
if len(dataframe) == 1:
tmp = tmp[:-2] + ")"
condition = "{col} IN {val}".format(col=cols[0], val=tmp)
else:
s = ""
for i, col in enumerate(cols):
if i > 0:
s += " AND `{k}` = ".format(k=col) + "'{" + str(i) + "}'"
else:
s = "`{k}` = ".format(k=col) + "'{" + str(i) + "}'"
s = "(" + s + ")"
conditions = []
for val in dataframe.as_matrix():
tmp = s.format(*val)
conditions.append(tmp)
condition = " OR ".join(conditions)
return condition
# 工具函数
def format_list(ids):
"""
格式化,以满足sql语句中 where ids in ("1", "2")
将['JR00001', 'JR00002'] 列表格式化为str: "('JR00001','JR00002')"
:param ids: type: list | set | str
:return:
"""
if isinstance(ids, str):
ids = [ids]
if isinstance(ids, list) or isinstance(ids, set):
_ids = ["'{}'".format(x) for x in ids]
if len(ids) > 0:
str_ids = "(" + ','.join(_ids) + ")"
else:
str_ids = "('')"
return str_ids
else:
return ids
数据库配置
from ..utils.sql_engine import _engine
from life_example.settings import MYSQL_USER,MYSQL_PASSWD,MYSQL_HOST,MYSQL_PORT,MYSQL_DBNAME
def engine_bazi():
db = '{}:{}@{}:{}'.format(MYSQL_USER, MYSQL_PASSWD, MYSQL_HOST, MYSQL_PORT)
return _engine(db, MYSQL_DBNAME)
数据库引擎
from sqlalchemy import create_engine
def _engine(connnect_info, db_name):
"""
:param connnect_info:连接信息,主机端口,用户密码
:param db_name: 数据库名
:return:
"""
return create_engine("mysql+pymysql://{}/{}".format(connnect_info, db_name), connect_args={"charset": "utf8"})