爬取外文工业技术期刊网页源码(自用)

#coding=utf-8
import requests
from pymongo import MongoClient
from lxml import etree
import datetime

# Crawl the 2014 foreign-journal article pages from new.wanfangdata.com.cn and
# store each page's raw HTML in MongoDB.
#
# Flow: read the journal names and per-journal hit counts from the
# `journal_name` collection, walk every search-result page for each journal,
# extract the article links, fetch each article page and insert one document
# per article into `journal_foreign_2014`.
#
# NOTE(review): `db.authenticate` and `Collection.insert` are the legacy
# PyMongo API (removed in PyMongo 4); they are kept as-is so the script still
# runs against the driver version it was written for.
client = MongoClient("localhost", 27017)

db = client["wanfang"]

# Source: documents holding the parallel `name_list` / `number_list` arrays.
collection = db["journal_name"]
# Destination: one document per fetched article page.
collection1 = db["journal_foreign_2014"]

db.authenticate("", "")

# The second document returned by the collection (index 1) carries the lists
# used below.
cursor = collection.find()[1]

# Results per search page; must agree with `pageSize=50` in the search URL.
PAGE_SIZE = 50
# Upper bound on the number of journals processed (the original hard-coded cap).
JOURNAL_COUNT = 2645
BASE_URL = "http://new.wanfangdata.com.cn"

# Pair each journal name with its hit count instead of indexing both lists
# with a shared counter; the slices keep the original 2645-entry cap while
# tolerating shorter lists.
journal_names = cursor['name_list'][:JOURNAL_COUNT]
journal_numbers = cursor['number_list'][:JOURNAL_COUNT]

for name, raw_number in zip(journal_names, journal_numbers):
    # The stored count is wrapped in one leading and one trailing character
    # (presumably brackets -- verify against the data); strip them.
    num = int(raw_number[1:-1])

    # Ceiling division with `//` so the result is an int on Python 2 and 3
    # (plain `/` yields a float on Python 3 and breaks `range`).
    count = (num + PAGE_SIZE - 1) // PAGE_SIZE

    # A dedicated loop variable: the original reused the outer index `i` here.
    for page in range(1, count + 1):

        url = "http://new.wanfangdata.com.cn/search/searchList.do?searchType=perio&pageSize=50&page="+str(page)+u"&searchWord= 摘要:is 起始年:2014 结束年:2014 刊名:" + name + "&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all"

        # NOTE: requests applies no timeout by default, so a stalled
        # connection blocks the crawl indefinitely.
        result = requests.post(url)
        tree = etree.HTML(result.text)
        # href of the element immediately following <strong> inside each
        # result's title <div>.
        links = tree.xpath("//div[@class='title']/strong/following-sibling::*[1]/@href")

        for href in links:
            url1 = BASE_URL + href
            # Fetch the article page itself. The original re-posted `url`
            # (the search page), so the stored HTML never matched `url1`.
            result1 = requests.post(url1)
            collection1.insert({
                'date': datetime.datetime.now(),
                'url': url1,
                'html': result1.text,
                'year': "2014",
            })

 

转载于:https://www.cnblogs.com/zhangtianyuan/p/8547559.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值