爬虫基本操作——从文件中批量读取需求,查询后,批量写入Excel表中

爬虫基本操作——从文件中批量读取需求,查询后,批量写入Excel表中

记录一下,以防遗忘。

代码如下(Python 2,带注释):

# code by sunkun 20170417
# -*- coding: UTF-8 -*-
import urllib
import urllib2
import re
import csv
import codecs

file = open("categorylist.txt") # 需要查询的关键词按行放在文件里
resultFile = open('C:\Users\sunkun\Desktop\categorylist_type.csv','wb')  # 输出的文件。 
# resultFile2 = open('C:\Users\sunkun\Desktop\pinpai2.csv','a+') 
resultFile.write(codecs.BOM_UTF8) # 防止中文乱码
fieldname = ['keywords', 'type'] # 返回文件,第一列放关键词,第二列放查询到的返回类型。
writer = csv.DictWriter(resultFile, fieldnames=fieldname)
# writer.writeheader() # 是否写上表头

while 1:
    line = file.readline()
    if not line:
        break
    # print line
    keyWords = (urllib).quote(line)     # url不支持中文,进行转码操作
    # keyWords = line.encode('utf-8')
    # line = '新闻软件'
    # url = 'http://quickshare.swc.sogou/quickshared?content=Clapton&class=null&part=0&platform=iOS&location=116.326022%7C39.992892&id=1481119148606063034&naming=raphael&framework=raphael'
    urlStart = 'http://quickshare.swc.sogou/quickshared?content='
    urlEnd = '&class=null&part=0&platform=iOS&location=116.326022%7C39.992892&id=1481119148606063034&naming=raphael&framework=raphael'
    url = urlStart + keyWords + urlEnd
    try:
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        content = response.read().decode('gbk')
        pattern = re.compile('<returntype>(.*?)</returntype>', re.S)
        resultType = re.findall(pattern, content)
        # print content
        print line, resultType[0]  # 测试
        writer.writerow({'keywords':line, 'type':resultType[0]}) # 写入到文件中
        # singleResult = ""+line+resultType[0]
        # singleResult = [line, resultType[0]]
        # print singleResult
        # writer = csv.writer(resultFile)
        # resultFile.writelines(singleResult)
        # resultFile.write(line+'\t')
        # resultFile.write("\t")
        # resultFile.write(resultType[0])
        # resultFile.write("\n")
        # resultFile.write(resultType[0])

        # resultFile1.writelines(line)
        # resultFile2.writelines(resultType[0])
        # resultFile2.writelines("\n")

    except urllib2.URLError, e: # 异常处理
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值