拉勾网认证企业信息采集

最新推荐文章于 2020-11-29 16:23:44 发布

huayanqiaq

最新推荐文章于 2020-11-29 16:23:44 发布

阅读量808

点赞数

分类专栏： python 文章标签：拉勾网

本文链接：https://blog.csdn.net/huayanqiaq/article/details/53958663

版权

python 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

数据库表语句：

-- Storage table for the crawler below: one row per scraped company page.
-- Column names (including the `companuname` spelling) are referenced verbatim
-- by the crawler's INSERT statement, so they must not be renamed here alone.
CREATE TABLE `lagou` (
  -- Auto-increment row id; covered only by the plain (non-unique) KEY below.
  `id` int(8) NOT NULL AUTO_INCREMENT,
  -- URL of the page that was fetched.
  `l_url` varchar(255) NOT NULL,
  -- href of the link inside the page's <h1>.
  `companyurl` varchar(255) NOT NULL,
  -- Text of the page's <title> element.
  `companuname` varchar(255) NOT NULL,
  -- Text of the div with class "company_word".
  `companyintro` blob NOT NULL,
  -- Product names from the "company_products" section, joined with "-".
  `companyproducte` blob NOT NULL,
  -- Value of the "companyProfile" field found in the page source.
  `companyintro2` blob NOT NULL,
  -- Raw contents of the `leaders":[...]` array found in the page source.
  `manage` blob NOT NULL,
  -- Raw contents of the `history":[...]` array found in the page source.
  `touzi_jigou` blob NOT NULL,
  -- Text of every p with class "mlist_li_desc", joined with "_".
  `companyarea` blob NOT NULL,
  -- Plain index, not a PRIMARY KEY: uniqueness of `id` is not enforced.
  KEY `id` (`id`) USING BTREE
-- AUTO_INCREMENT=110039 makes the next generated id start at that value.
) ENGINE=MyISAM AUTO_INCREMENT=110039 DEFAULT CHARSET=utf8;

因为拉勾有各种限制（主要是访问频率限制），所以这里只用单线程采集，采集频率也不高；有兴趣的同学可以加代理或者动态拨号，再改成多线程的。

python代码：

#coding:utf-8
import requests
import Queue
import threading
from lxml import etree
import MySQLdb
import re
import time
import random

# Module-wide MySQL connection and cursor; crawl() uses them for every insert.
# The connection is opened as a side effect of importing/running this file.
# NOTE(review): host, user and password are hard-coded for a local database
# named `ceshi` -- move them to configuration before running anywhere else.
conn= MySQLdb.connect(
        host='localhost',
        port = 3306,
        user='root',
        passwd='root',
        db ='ceshi',
        charset="utf8",
        )
cur = conn.cursor()

# Patterns applied to the raw page source (not to the parsed DOM); crawl()
# keeps only the first match of each.
# Contents of a `leaders":[ ... ]` array, non-greedy up to the first `]`.
repace=re.compile(r'leaders":\[(.*?)\]')
# Contents of a `history":[ ... ]` array, non-greedy up to the first `]`.
repace1=re.compile(r'history":\[(.*?)\]')
# Value of a `"companyProfile":"..."` field, up to the next double quote.
repace2=re.compile(r'"companyProfile\"\:\"(.*?)\"')
# Held by crawl() around cursor.execute()/commit() on the shared connection.
threadLock = threading.Lock()
#html=requests.get('https://www.lagou.com/gongsi/10.html')
# Pool of browser User-Agent header values; crawl() picks one at random for
# each request.
USER_AGENTS = [
  "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
  "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
  "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
  "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
  "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
  "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
  "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
  "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
  "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
  "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# Work queue of company page URLs: filled by the __main__ block, drained by crawl().
q_queue=Queue.Queue()
# Parameterized INSERT: the %s markers are DB-API placeholders filled in by
# cursor.execute(), not Python string formatting, so scraped text containing
# quotes or backslashes cannot break or alter the statement.  `id` is left out
# so the table's AUTO_INCREMENT assigns it.
_INSERT_SQL = (
    "insert into lagou(l_url,companyurl,companuname,companyintro,"
    "companyproducte,companyintro2,manage,touzi_jigou,companyarea) "
    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _join_reversed(nodes, sep):
    """Concatenate the stripped text of *nodes* in reverse order.

    Every entry is followed by *sep*, so a non-empty result ends with *sep*
    (e.g. two nodes "a", "b" with sep "-" give "b-a-").  Nodes that have no
    text contribute an empty entry instead of raising.
    """
    return "".join((node.text or "").strip() + sep for node in reversed(nodes))


def _parse_company(l_url, content):
    """Build the parameter tuple for ``_INSERT_SQL`` from one company page.

    Args:
        l_url: URL the page was fetched from.
        content: decoded HTML source of the page.

    Returns:
        A 9-tuple in the column order of ``_INSERT_SQL``.  Values are stored
        verbatim; no backslash substitution is applied because escaping is
        handled by the parameterized query.

    Raises:
        IndexError: a required element or pattern is absent from the page.
        AttributeError: a required element has no text.
    """
    tree = etree.HTML(content)
    title = tree.xpath('//head/title')[0].text.strip()
    company_url = tree.xpath('//h1/a/@href')[0].strip()
    intro = tree.xpath('//div[@class="company_word"]')[0].text.strip()
    products = _join_reversed(
        tree.xpath('//*[@id="company_products"]/div[2]/div[@class="product_content product_item clearfix"]/div/h4/div/a[1]'),
        "-",
    )
    # These three come from data embedded in the raw source, not from the DOM.
    profile = repace2.findall(content)[0].strip()
    leaders = repace.findall(content)[0].strip()
    investors = repace1.findall(content)[0].strip()
    areas = _join_reversed(tree.xpath('//p[@class="mlist_li_desc"]'), "_")
    return (l_url, company_url, title, intro, products,
            profile, leaders, investors, areas)


def _save_company(row):
    """Insert *row* into ``lagou`` and commit; print and swallow DB errors."""
    # The cursor and connection are shared module globals, so serialize access.
    with threadLock:
        try:
            cur.execute(_INSERT_SQL, row)
            conn.commit()
        except Exception as exc:
            # Best effort: one failed insert must not stop the crawl.
            print(exc)


def crawl():
    """Drain ``q_queue``: fetch each company page, parse it and store a row.

    Runs until the queue is empty.  Waits 4 seconds before every request and
    sends a randomly chosen User-Agent.  A URL is skipped (nothing is stored)
    when the request fails, the response status is not 200, or the page
    cannot be decoded/parsed; the error is printed and the loop moves on.
    """
    while True:
        # get_nowait() instead of empty()+get(): with several workers another
        # thread could take the last item between the two calls and leave
        # this one blocked forever.
        try:
            url = q_queue.get_nowait()
        except Queue.Empty:
            return

        time.sleep(4)
        header = {'User-Agent': random.choice(USER_AGENTS)}
        try:
            req = requests.get(url, headers=header, timeout=16,
                               allow_redirects=False)
        except Exception as exc:
            print(exc)
            # Without this the code below would use an undefined or stale
            # response object.
            continue

        if req.status_code != 200:
            continue

        l_url = req.url
        print(l_url)
        try:
            row = _parse_company(l_url, req.content.decode('utf-8'))
        except Exception as exc:
            print(exc)
            # Never insert a row assembled from missing or stale fields.
            continue

        _save_company(row)

if __name__ == "__main__":
    # Enqueue the detail-page URL for every company id from 1722 up to
    # (but not including) 130000.
    for company_id in range(1722, 130000):
        page_url = "https://www.lagou.com/gongsi/" + str(company_id) + ".html"
        q_queue.put(page_url)

    # A single worker thread drains the queue; block until it has finished,
    # then pause one second before the script exits.
    worker = threading.Thread(target=crawl)
    worker.start()
    worker.join()
    time.sleep(1)