pyspider 学习笔记
url = "https://www.creditchina.gov.cn/xinxigongshi/shipinanquanjianduchoujian/jieguoxiangqing/index.html?id=62335&dataType=1"
# # print("aaaaaa")
# # html = requests.get(url, headers=headers, proxies=proxies )
# # html.encoding = "utf-8"
# #
# # content = pq(str(html.text))('div.content.clearfix > div > div > div.result-tab.result-tab1')
# # print(content)
pyspider 安装路径：C:\Users\YScredit\AppData\Roaming\Python\Python35\site-packages\pyspider
调试时可打印页面 response.text() 查看内容。指定数据库和表的编码格式：
CREATE DATABASE dbtest CHARACTER SET utf8 COLLATE utf8_general_ci;
-- Table used by the Tencent-jobs crawler example below; every column is a
-- plain VARCHAR(111) string, utf8 throughout (matches the database above).
CREATE TABLE tbtest(
NAME VARCHAR(111),
TYPE VARCHAR(111),
num VARCHAR(111),
address VARCHAR(111),
TIME VARCHAR(111)
)CHARACTER SET utf8 COLLATE utf8_general_ci;
实例 No.1：爬取福州法院网被执行人名单
from pyspider. libs. base_handler import *
from pyquery import PyQuery as pq
import pymysql
from six import itervalues
import time
import datetime
import re
class Handler(BaseHandler):
    """Crawl the Fuzhou court site (福州法院网) list of judgement debtors
    and persist each parsed record to MySQL.

    Flow: on_start -> basic_page (listing rows) -> detail_page (record
    table) -> on_result -> save_mysql.
    """

    crawl_config = {
        'itag': 'bzxr-fuzhou-0.7',
        # NOTE(review): pyspider's crawl_config key is 'timeout';
        # 'time_out' is most likely ignored -- confirm before renaming.
        'time_out': 4000,
    }

    @every(minutes=24 * 60)
    @config(age=12 * 60 * 60)
    def on_start(self):
        """Scheduled entry point: fetch the listing page once a day."""
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'fzszy.chinacourt.org',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
        }
        basic_url = 'http://fzszy.chinacourt.org/public/more.php?LocationID=0704000000'
        self.crawl(url=basic_url, headers=headers, callback=self.basic_page)

    @config(age=12 * 60 * 60)
    def basic_page(self, response):
        """Parse the listing page and schedule one detail crawl per row.

        A record template is passed along via ``save=`` so detail_page can
        fill in the remaining fields.
        """
        dic = {
            'id': '',
            'name': '',
            'case_code': '',
            'name_id': '',
            'itype': '',
            'card_num': '',
            'business_entity': '',
            'sex': '',
            'age': '',
            'address': '',
            'execute_money_backup': '',
            'unexecute_money_backup': '',
            'reg_date': '',
            'court_name': '',
            'org_url': '',
            'source': '福州法院网',
            'case_id': '',
            'exp': '1'
        }
        try:
            basic_url = response.url
            basic_trs = response.doc('tr.tr_odd td.td_line')
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Host': 'fzszy.chinacourt.org',
                'Referer': basic_url,
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
            }
            for each_tr in basic_trs.items():
                org_url = each_tr('a').attr.href
                # each detail crawl gets its own copy of the template;
                # the original mutated and re-used one shared dict, so
                # every scheduled task could end up with the last org_url
                detail_dic = dict(dic)
                detail_dic['org_url'] = org_url
                self.crawl(url=org_url, headers=headers,
                           callback=self.detail_page, save=detail_dic)
        except Exception:
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic

    @config(priority=2)
    @config(age=12 * 60 * 60)
    def detail_page(self, response):
        """Parse the detail table (skipping the two header rows) and yield
        one record per data row."""
        response.encoding = 'gbk'
        dic = response.save
        try:
            detail_trs = response.doc(' span.detail_content > strong > table ')
            detail_tr = detail_trs('tr:not(:first-child):not(:nth-child(2))')
            for each_tr in detail_tr.items():
                # yield a fresh copy per row; re-using one dict would make
                # every yielded record alias the same (last) row
                row = dict(dic)
                row['id'] = each_tr('td:nth-child(1)').text().replace(' ', '')
                row['name'] = each_tr('td:nth-child(2)').text().replace(' ', '')
                row['name_id'] = each_tr('td:nth-child(3)').text().replace(' ', '')
                row['address'] = each_tr('td:nth-child(4)').text().replace(' ', '')
                row['execute_money_backup'] = each_tr('td:nth-child(5)').text().replace(' ', '')
                yield row
        except Exception:
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic

    def on_result(self, result):
        """Log and persist a single yielded record; skip empty results."""
        # guard first: the original iterated ``result`` before this check,
        # which crashes on None/empty results
        if not result:
            return
        for key in result:
            print(key + " " + result[key])
        self.save_mysql(result)

    def save_mysql(self, item):
        """Insert one record into the ``tencent`` table of db ``demo1``.

        NOTE(review): the column names (positionName, ...) look copied from
        the Tencent-jobs example; court-record fields are stored under
        job-posting column names -- confirm the target schema is intended.
        """
        connect = pymysql.connect(host="192.168.59.128",
                                  user="root",
                                  password="123456",
                                  db="demo1",
                                  charset="utf8",
                                  use_unicode=False)
        try:
            with connect.cursor() as cursor:
                sql = "insert into tencent (positionName,positionLink,positionType," \
                      "peopleNumber,workLocation,publishTime)VALUES(%s,%s,%s,%s,%s,%s)"
                # values are bound by the driver, not string-formatted
                params = (item["address"], item['name'], item['source'],
                          item['execute_money_backup'], item['card_num'],
                          item['itype'])
                cursor.execute(sql, params)
            connect.commit()
        finally:
            # close even when execute/commit raises (original leaked here)
            connect.close()
实例 No.2：爬取腾讯招聘
from pyspider. libs. base_handler import *
from pyquery import PyQuery as pq
import pymysql
from six import itervalues
import time
import datetime
import re
class Handler(BaseHandler):
    """Crawl Tencent's job-posting list (hr.tencent.com) page by page and
    store each posting in MySQL (db ``dbtest``, table ``tbtest``).

    Flow: on_start -> basic_page (discover page count) -> detail_page
    (one list page) -> on_result -> insert_text / save_mysql.
    """

    crawl_config = {
        'itag': 'bzxr-fuzhou-0.8',
        # NOTE(review): pyspider's crawl_config key is 'timeout';
        # 'time_out' is most likely ignored -- confirm before renaming.
        'time_out': 4000,
        # NOTE(review): proxy credentials are hard-coded here; consider
        # moving them to configuration.
        'proxy': 'H21WNK49K6PFSR3P:BF2B9DDE973F0C02@http-pro.abuyun.com:9010'
    }

    @every(minutes=24 * 60)
    @config(age=12 * 60 * 60)
    def on_start(self):
        """Scheduled entry point: fetch the first listing page daily."""
        basic_url = 'https://hr.tencent.com/position.php?&start=#a0'
        self.crawl(url=basic_url, callback=self.basic_page, validate_cert=False)

    @config(age=12 * 60 * 60)
    def basic_page(self, response):
        """Read the page count from the pager and schedule every list page.

        The site pages in steps of 10 (``start=0,10,20,...``); the last
        numeric link of the pager holds the total page count.
        """
        page_url = "https://hr.tencent.com/position.php?&start={}#a"
        count_all = int(response.doc("div.pagenav> a:nth-child(10)").text())
        for start in range(0, count_all * 10, 10):
            self.crawl(url=page_url.format(start), validate_cert=False,
                       callback=self.detail_page)

    @config(priority=2)
    @config(age=12 * 60 * 60)
    def detail_page(self, response):
        """Parse one list page and yield one dict per job row.

        The first and last <tr> of the table are the header and the pager,
        so both are skipped.
        """
        response.encoding = 'utf-8'
        # defined BEFORE the try block: the original only created ``dic``
        # inside the loop, so a parse failure before the first row made
        # the except handler raise NameError
        dic = {
            'name': '',
            'type': '',
            'num': '',
            'address': '',
            'time': '',
        }
        try:
            table = response.doc('#position > div.left.wcont_b.box > table ')
            rows = table('tr:not(:first-child):not(:last-child)')
            for each_tr in rows.items():
                dic = {
                    'name': '',
                    'type': '',
                    'num': '',
                    'address': '',
                    'time': '',
                }
                dic['name'] = each_tr('td:nth-child(1)').text().replace(' ', '')
                category = each_tr('td:nth-child(2)').text().replace(' ', '')
                # some rows have no category; store the literal "null"
                dic['type'] = category if category else "null"
                dic['num'] = each_tr('td:nth-child(3)').text().replace(' ', '')
                dic['address'] = each_tr('td:nth-child(4)').text().replace(' ', '')
                dic['time'] = each_tr('td:nth-child(5)').text().replace(' ', '')
                yield dic
        except Exception:
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic

    def on_result(self, result):
        """Persist a single yielded record; skip empty results."""
        print(result)
        if not result:
            return
        # NOTE(review): the record is written twice (insert_text and
        # save_mysql both target dbtest.tbtest) -- likely only one of
        # the two inserts is wanted.
        self.insert_text(tablename='tbtest', **result)
        self.save_mysql(result)

    def escape(self, string):
        """Format a SQL identifier for interpolation.

        NOTE(review): this performs no real escaping and does NOT protect
        against SQL injection; it is only safe here because the table and
        column names come from code, not from user input.
        """
        return '%s' % string

    def insert_text(self, tablename=None, **values):
        """INSERT IGNORE **values into *tablename*; True on success."""
        conn_kwargs = {
            'host': '192.168.59.128',
            'user': 'root',
            'passwd': '123456',
            'db': 'dbtest',
            'charset': 'utf8'
        }
        tablename = self.escape(tablename)
        cnx = pymysql.connect(**conn_kwargs)
        try:
            cur = cnx.cursor()
            if values:
                # identifiers cannot be bound parameters, so they are
                # interpolated; the values themselves are driver-escaped
                columns = ",".join(self.escape(k) for k in values)
                placeholders = ",".join(['%s'] * len(values))
                sql_query = "INSERT IGNORE INTO %s (%s) values (%s)" % (
                    tablename, columns, placeholders)
                print(sql_query, list(itervalues(values)))
            try:
                if values:
                    print(cur.execute(sql_query, list(itervalues(values))))
                cnx.commit()
                return True
            except Exception as e:
                print(e)
                return False
        finally:
            # close even on failure (original leaked the connection)
            cnx.close()

    def save_mysql(self, item):
        """Insert one job record into dbtest.tbtest."""
        connect = pymysql.connect(host="192.168.59.128",
                                  user="root",
                                  password="123456",
                                  db="dbtest",
                                  charset="utf8",
                                  use_unicode=False)
        try:
            with connect.cursor() as cursor:
                sql = "insert into tbtest (name,type,num," \
                      "address,time)VALUES(%s,%s,%s,%s,%s)"
                # locals renamed so the builtin ``type`` and the ``time``
                # module are no longer shadowed (original shadowed both)
                params = (item["name"], item['type'], item['num'],
                          item['address'], item['time'])
                cursor.execute(sql, params)
            connect.commit()
        finally:
            connect.close()