Python crawler

Scraping the personal information of teachers at a certain university.

Please use this legitimately and responsibly; otherwise, please close this blog.
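The crawler stores its results in a MySQL table named PersonalInfo_1. The table definition is not part of this post, so here is a minimal sketch of a schema that the INSERT statement in the code would accept; the column names come from the code, but the types and lengths are my assumptions, and the masked connection parameters are placeholders just as in the crawler itself.

# -*- coding:utf-8 -*-
# Hypothetical schema for PersonalInfo_1, inferred from the INSERT in the
# crawler below; the VARCHAR sizes are assumptions, not from the original post.
import MySQLdb

db = MySQLdb.connect("***.*.***.***", "*****", "*******", "***", charset="utf8")
cursor = db.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS PersonalInfo_1 (
        name     VARCHAR(64),
        zhicheng VARCHAR(64),    -- professional title
        xueyuan  VARCHAR(128),   -- school / department
        picture  VARCHAR(255),   -- absolute URL of the profile photo
        tel      VARCHAR(32),
        email    VARCHAR(128),
        address  VARCHAR(255)
    ) DEFAULT CHARSET=utf8
''')
db.commit()
db.close()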

The code is as follows:

# -*- coding:utf-8 -*-
'''crawl HIT teachers' personal information
    author      :       wud
    date        :       2017/12/4
    version     :       1.0
'''
import urllib2
import random
import MySQLdb
import re

def main():
    # connect to MySQL (host, user, password and database name are masked)
    db = MySQLdb.connect("***.*.***.***", "*****", "*******", "***", charset="utf8")
    cursor = db.cursor()
    f = open("url.txt", 'r')  # one teacher-page URL per line
    flag = 1
    while flag <= 1:  # loop bound: raise this to the number of URLs in url.txt
        url = f.readline().strip()
        print flag
        print url
        try:
            # a desktop Chrome User-Agent; add more strings here to rotate them
            my_headers = ["Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"]
            r = getContent(url, my_headers)
            print r

            # pull each field out of the page HTML with a targeted regex;
            # joining findall() with '' yields the match, or '' on a miss
            name_keyword = re.compile(r'''<span class="name tit36">(.*?)</span>''', re.U | re.S)
            name = ''.join(name_keyword.findall(r))

            telphone_keyword = re.compile(r'''电话</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;">(.*?)</span>''', re.U | re.S)
            telphone = ''.join(telphone_keyword.findall(r))
            print telphone

            email_keyword = re.compile(r'''邮箱</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;"><a href="(.*?)" title''', re.U | re.S)
            email = ''.join(email_keyword.findall(r))
            print email

            address_keyword = re.compile(r'''地址</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;">(.*?)</span>''', re.U | re.S)
            address = ''.join(address_keyword.findall(r))
            print address

            picture_keyword = re.compile(r'''style="cursor:default;"><img src=(.*?) style="width:191px;height:191px;"></a>''', re.U | re.S)
            picture = ''.join(picture_keyword.findall(r))
            # the captured src still carries its surrounding quotes; strip them
            picture = "http://homepage.hit.edu.cn" + picture[1:-1]

            # 职称 = professional title
            zhicheng_keyword = re.compile(r''' <div class="show-js text14"><span title="职称">(.*?)</span>''', re.U | re.S)
            zhicheng = ''.join(zhicheng_keyword.findall(r))

            # 目前就职 = current school / department
            xueyuan_keyword = re.compile(r'''目前就职</td><td width="300px" class="show-text1">(.*?)</td>''', re.U | re.S)
            xueyuan = ''.join(xueyuan_keyword.findall(r))
            information = name + " " + zhicheng + " " + xueyuan + " " + email + " " + telphone + " " + address + " " + picture
            f0 = open("hitteacherspersonalinfromation_diff.txt", 'a+')  # URLs whose records failed validation
            f6 = open("hitteacherspersonalinfromation.txt", 'a+')       # plain-text dump of every scraped record
            print >> f6, information
            print information
            flag += 1
            f6.close()
            # the joined findall() results are empty strings on a miss,
            # never None, so test for truthiness rather than != None
            if xueyuan and zhicheng:
                cursor.execute('INSERT INTO PersonalInfo_1(name,zhicheng,xueyuan,picture,tel,email,address) VALUES (%s,%s,%s,%s,%s,%s,%s)',
                               (name, zhicheng, xueyuan, picture, telphone, email, address))
                db.commit()
                print "SAVE IT!"
            else:
                print "SAVE FAIL"
                print >> f0, url
            f0.close()
        except Exception:
            flag += 1
            f7 = open("fail.txt", 'a+')  # 'a+' appends; 'r+' fails when the file does not exist
            print >> f7, url
            f7.close()
            print "request error"
    f.close()
    print "END"


def getContent(url, headers):
    # fetch a page with a User-Agent picked at random from the supplied pool;
    # urllib2 derives the Host header from the URL automatically
    random_header = random.choice(headers)
    req = urllib2.Request(url)
    req.add_header("User-Agent", random_header)
    content = urllib2.urlopen(req).read()
    return content

if __name__ == '__main__':
    main()
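Note that urllib2 and the print >> file syntax are Python 2 only, so the script will not run unmodified on Python 3. Below is a rough sketch of what the fetch-and-extract core might look like on Python 3 with the requests library; the substitution of requests for urllib2 is mine, the field regexes are copied from the code above, the database write is left out, and url.txt / fail.txt play the same roles as in the original.

# Python 3 sketch of the fetch-and-extract core; `requests` replaces urllib2
# (this substitution is mine, not from the original post).
import re
import random
import requests

UA_POOL = ["Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"]

# field name -> extraction regex, copied from the crawler above
FIELDS = {
    "name": r'<span class="name tit36">(.*?)</span>',
    "telphone": r'电话</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;">(.*?)</span>',
    "email": r'邮箱</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;"><a href="(.*?)" title',
    "address": r'地址</span></span><span class="col-lg-3 col-sm-3" style="padding-right:10px;">(.*?)</span>',
}

def get_content(url):
    # fetch one teacher page with a randomly chosen User-Agent
    resp = requests.get(url, headers={"User-Agent": random.choice(UA_POOL)}, timeout=10)
    resp.raise_for_status()
    resp.encoding = "utf-8"  # assumption: the site serves UTF-8 pages
    return resp.text

def extract(html):
    # run every field regex over the page; fields that do not match come back as ''
    return {field: ''.join(re.findall(pattern, html, re.U | re.S))
            for field, pattern in FIELDS.items()}

if __name__ == '__main__':
    with open("url.txt") as f:  # one teacher-page URL per line
        for url in f:
            url = url.strip()
            if not url:
                continue
            try:
                print(extract(get_content(url)))
            except requests.RequestException:
                with open("fail.txt", "a") as fail:  # same role as fail.txt above
                    fail.write(url + "\n")

The regex-scraping approach is fragile either way: any change to the page markup silently breaks the patterns, which is why each joined findall() result is validated before the record is written to the database.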