数据库建表语句:
-- Storage table for company pages scraped from lagou.com: the crawler inserts
-- one row per successfully fetched and parsed page.
-- NOTE: the misspelled column names (`companuname`, `companyproducte`) are the
-- names the crawler's INSERT statement uses, so they must stay as they are.
CREATE TABLE `lagou` (
`id` int(8) NOT NULL AUTO_INCREMENT, -- row id generated by MySQL
`l_url` varchar(255) NOT NULL, -- URL of the scraped company page
`companyurl` varchar(255) NOT NULL, -- href of the link inside the page's <h1>
`companuname` varchar(255) NOT NULL, -- text of the page's <title>
`companyintro` blob NOT NULL, -- text of the div with class "company_word"
`companyproducte` blob NOT NULL, -- product names, each followed by "-"
`companyintro2` blob NOT NULL, -- "companyProfile" string from the page's embedded JSON
`manage` blob NOT NULL, -- raw contents of the embedded "leaders" JSON array
`touzi_jigou` blob NOT NULL, -- raw contents of the embedded "history" JSON array
`companyarea` blob NOT NULL, -- texts of the "mlist_li_desc" paragraphs, each followed by "_"
KEY `id` (`id`) USING BTREE -- plain (non-unique) index; the table declares no PRIMARY KEY
) ENGINE=MyISAM AUTO_INCREMENT=110039 DEFAULT CHARSET=utf8;
由于拉勾网有访问频率等各种限制,这里直接用单线程采集,采集频率也不高;有兴趣的同学可以加代理或者动态拨号,再改成多线程。
python代码:
#coding:utf-8
import requests
import Queue
import threading
from lxml import etree
import MySQLdb
import re
import time
import random
# Module-wide MySQL connection and cursor, created once at import time and
# shared by the crawler for every INSERT.
conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='ceshi',
    charset="utf8",
)
cur = conn.cursor()
# Patterns applied to the raw page source to pull values out of the JSON that
# is embedded in the HTML. Each one captures lazily, i.e. up to the first
# closing delimiter that follows the key.
repace = re.compile(r'leaders":\[(.*?)\]')    # contents of the "leaders" array
repace1 = re.compile(r'history":\[(.*?)\]')   # contents of the "history" array
repace2 = re.compile(r'"companyProfile\"\:\"(.*?)\"')  # "companyProfile" string value

# Held by crawl() while it executes and commits an INSERT on the shared cursor.
threadLock = threading.Lock()
#html=requests.get('https://www.lagou.com/gongsi/10.html')
# Pool of browser User-Agent strings; crawl() picks one at random per request.
USER_AGENTS = [
    # Internet Explorer variants
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    # Arora
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    # Gecko-based browsers
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    # Chrome
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    # Opera
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# Work queue of company-page URLs; filled by the __main__ block, drained by crawl().
q_queue = Queue.Queue()

# Parameterized INSERT: the driver escapes every value, so quotes in scraped
# text can neither break the statement nor inject SQL. `id` is omitted so
# MySQL assigns it through AUTO_INCREMENT.
_INSERT_SQL = (
    "insert into lagou(l_url,companyurl,companuname,companyintro,"
    "companyproducte,companyintro2,manage,touzi_jigou,companyarea) "
    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _join_reversed(texts, sep):
    """Concatenate ``texts`` from last to first, each item followed by ``sep``."""
    return "".join(text + sep for text in reversed(texts))


def _parse_company(content):
    """Extract the stored company fields from one page's decoded HTML.

    Returns a tuple ordered like the columns of ``_INSERT_SQL`` after
    ``l_url``: (companyurl, companuname, companyintro, companyproducte,
    companyintro2, manage, touzi_jigou, companyarea).

    Raises IndexError when an expected element or JSON key is missing and
    AttributeError when a matched element has no text; the caller treats any
    exception as "skip this page".
    """
    tree = etree.HTML(content)
    # Backslashes are replaced by '1' exactly as before, so that new rows stay
    # consistent with the rows already collected.
    title = tree.xpath('//head/title')[0].text.strip().replace('\\', '1')
    company_url = tree.xpath('//h1/a/@href')[0].strip().replace('\\', '1')
    intro = tree.xpath('//div[@class="company_word"]')[0].text.strip().replace('\\', '1')

    product_nodes = tree.xpath('//*[@id="company_products"]/div[2]/div[@class="product_content product_item clearfix"]/div/h4/div/a[1]')
    # Built outside any loop, so a page without products yields '' instead of
    # leaving the variable undefined.
    products = _join_reversed(
        [node.text.strip() for node in product_nodes], "-"
    ).replace('\\', '1')

    profile = repace2.findall(content)[0].strip().replace('\\', '1')
    leaders = repace.findall(content)[0].strip().replace('\\', '1')
    history = repace1.findall(content)[0].strip()

    area_nodes = tree.xpath('//p[@class="mlist_li_desc"]')
    areas = _join_reversed(
        [node.text.strip() for node in area_nodes], "_"
    ).replace('\\', '1')

    return (company_url, title, intro, products, profile, leaders, history, areas)


def crawl():
    """Drain ``q_queue``: fetch each company page, parse it and store one row.

    For every queued URL the worker sleeps 4 seconds, requests the page with a
    random User-Agent and without following redirects, and only processes
    responses with status 200. A page whose request or parsing fails is
    reported with ``print`` and skipped, so nothing is inserted for it.
    Database errors are printed and the loop moves on to the next URL.
    """
    while not q_queue.empty():
        url = q_queue.get()
        time.sleep(4)  # fixed delay before every request to stay under the site's rate limit
        header = {'User-Agent': random.choice(USER_AGENTS)}
        try:
            req = requests.get(url, headers=header, timeout=16, allow_redirects=False)
        except Exception as exc:  # best-effort crawler: report and move on
            print(exc)
            # Without this, `req` would be undefined (first URL) or still hold
            # the previous page's response.
            continue

        if req.status_code != 200:
            continue

        l_url = req.url
        print(l_url)
        try:
            fields = _parse_company(req.content.decode('utf-8'))
        except Exception as exc:  # missing element, bad encoding, parser error
            print(exc)
            # Never insert partially parsed or stale values from an earlier page.
            continue

        # `with` releases the lock even if the handler below raises.
        with threadLock:
            try:
                cur.execute(_INSERT_SQL, (l_url,) + fields)
                conn.commit()
            except Exception as exc:
                print(exc)
if __name__ == "__main__":
    # Company pages are addressed by a numeric id; enqueue ids 1722..129999.
    for i in range(1722, 130000):
        url = "https://www.lagou.com/gongsi/" + str(i) + ".html"
        q_queue.put(url)

    # A single worker thread drains the queue.
    t = threading.Thread(target=crawl)
    t.start()
    t.join()

    # The worker has finished, so the shared DB handles can be released.
    cur.close()
    conn.close()
共采集了10多万条数据。
采集并拆分出的内容很多:其中包括所有管理者的信息(学历、工作经历等都有),以及该公司历史上的投资情况。
需要数据的可以私信我。