Python crawler

Scraping the personal information of teachers at a certain university.

Please use this legitimately and responsibly; otherwise, please close this blog.
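The crawler stores its results in a MySQL table named PersonalInfo_1. The table definition is not part of this post, so here is a minimal sketch of a schema that the INSERT statement in the code would accept; the column names come from the code, but the types and lengths are my assumptions, and the masked connection parameters are placeholders just as in the crawler itself.

# -*- coding:utf-8 -*-
# Hypothetical schema for PersonalInfo_1, inferred from the INSERT in the
# crawler below; the VARCHAR sizes are assumptions, not from the original post.
import MySQLdb

db = MySQLdb.connect("***.*.***.***", "*****", "*******", "***", charset="utf8")
cursor = db.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS PersonalInfo_1 (
        name     VARCHAR(64),
        zhicheng VARCHAR(64),    -- professional title
        xueyuan  VARCHAR(128),   -- school / department
        picture  VARCHAR(255),   -- absolute URL of the profile photo
        tel      VARCHAR(32),
        email    VARCHAR(128),
        address  VARCHAR(255)
    ) DEFAULT CHARSET=utf8
''')
db.commit()
db.close()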

The code is as follows:

# -*- coding:utf-8 -*-
'''crawl HIT teachers' personal information
    author      :       wud
    date        :       2017/12/4
    version     :       1.0
'''
import urllib2
import random
import MySQLdb
import re

def main():
    # connect to MySQL (host, user, password and database name are masked)
    db = MySQLdb.connect("***.*.***.***", "*****", "*******", "***", charset="utf8")
    cursor = db.cursor()
    f = open("url.txt", 'r')  # one teacher-page URL per line
    flag = 1
    while flag <= 1:  # loop bound: raise this to the number of URLs in url.txt
        url = f.readline().strip()
        print flag
        print url
        try:
            # a desktop Chrome User-Agent; add more strings here to rotate them
            my_headers = ["Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"]
            r = getContent(url, my_headers)
            print r

            # pull each field out of the page HTML with a targeted regex;
            # joining findall() with '' yields the match, or '' on a miss
            name_keyword = re.compile(r'''<span class="name tit36">(.*?)</span>''', re.U | re.S)
            name = ''.join(name_keyword.findall(r))

            telphone_keyword = re.compile(r'''电话</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;">(.*?)</span>''', re.U | re.S)
            telphone = ''.join(telphone_keyword.findall(r))
            print telphone

            email_keyword = re.compile(r'''邮箱</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;"><a href="(.*?)" title''', re.U | re.S)
            email = ''.join(email_keyword.findall(r))
            print email

            address_keyword = re.compile(r'''地址</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;">(.*?)</span>''', re.U | re.S)
            address = ''.join(address_keyword.findall(r))
            print address

            picture_keyword = re.compile(r'''style="cursor:default;"><img src=(.*?) style="width:191px;height:191px;"></a>''', re.U | re.S)
            picture = ''.join(picture_keyword.findall(r))
            # the captured src still carries its surrounding quotes; strip them
            picture = "http://homepage.hit.edu.cn" + picture[1:-1]

            # 职称 = professional title
            zhicheng_keyword = re.compile(r''' <div class="show-js text14"><span title="职称">(.*?)</span>''', re.U | re.S)
            zhicheng = ''.join(zhicheng_keyword.findall(r))

            # 目前就职 = current school / department
            xueyuan_keyword = re.compile(r'''目前就职</td><td width="300px" class="show-text1">(.*?)</td>''', re.U | re.S)
            xueyuan = ''.join(xueyuan_keyword.findall(r))
            information = name + " " + zhicheng + " " + xueyuan + " " + email + " " + telphone + " " + address + " " + picture
            f0 = open("hitteacherspersonalinfromation_diff.txt", 'a+')  # URLs whose records failed validation
            f6 = open("hitteacherspersonalinfromation.txt", 'a+')       # plain-text dump of every scraped record
            print >> f6, information
            print information
            flag += 1
            f6.close()
            # the joined findall() results are empty strings on a miss,
            # never None, so test for truthiness rather than != None
            if xueyuan and zhicheng:
                cursor.execute('INSERT INTO PersonalInfo_1(name,zhicheng,xueyuan,picture,tel,email,address) VALUES (%s,%s,%s,%s,%s,%s,%s)',
                               (name, zhicheng, xueyuan, picture, telphone, email, address))
                db.commit()
                print "SAVE IT!"
            else:
                print "SAVE FAIL"
                print >> f0, url
            f0.close()
        except Exception:
            flag += 1
            f7 = open("fail.txt", 'a+')  # 'a+' appends; 'r+' fails when the file does not exist
            print >> f7, url
            f7.close()
            print "request error"
    f.close()
    print "END"


def getContent(url, headers):
    # fetch a page with a User-Agent picked at random from the supplied pool;
    # urllib2 derives the Host header from the URL automatically
    random_header = random.choice(headers)
    req = urllib2.Request(url)
    req.add_header("User-Agent", random_header)
    content = urllib2.urlopen(req).read()
    return content

if __name__ == '__main__':
    main()
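Note that urllib2 and the print >> file syntax are Python 2 only, so the script will not run unmodified on Python 3. Below is a rough sketch of what the fetch-and-extract core might look like on Python 3 with the requests library; the substitution of requests for urllib2 is mine, the field regexes are copied from the code above, the database write is left out, and url.txt / fail.txt play the same roles as in the original.

# Python 3 sketch of the fetch-and-extract core; `requests` replaces urllib2
# (this substitution is mine, not from the original post).
import re
import random
import requests

UA_POOL = ["Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"]

# field name -> extraction regex, copied from the crawler above
FIELDS = {
    "name": r'<span class="name tit36">(.*?)</span>',
    "telphone": r'电话</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;">(.*?)</span>',
    "email": r'邮箱</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;"><a href="(.*?)" title',
    "address": r'地址</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;">(.*?)</span>',
}

def get_content(url):
    # fetch one teacher page with a randomly chosen User-Agent
    resp = requests.get(url, headers={"User-Agent": random.choice(UA_POOL)}, timeout=10)
    resp.raise_for_status()
    resp.encoding = "utf-8"  # assumption: the site serves UTF-8 pages
    return resp.text

def extract(html):
    # run every field regex over the page; fields that do not match come back as ''
    return {field: ''.join(re.findall(pattern, html, re.U | re.S))
            for field, pattern in FIELDS.items()}

if __name__ == '__main__':
    with open("url.txt") as f:  # one teacher-page URL per line
        for url in f:
            url = url.strip()
            if not url:
                continue
            try:
                print(extract(get_content(url)))
            except requests.RequestException:
                with open("fail.txt", "a") as fail:  # same role as fail.txt above
                    fail.write(url + "\n")

The regex-scraping approach is fragile either way: any change to the page markup silently breaks the patterns, which is why each joined findall() result is validated before the record is written to the database.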