Zhihu topic tag detail page crawler

#!/usr/bin/env python
import requests
import MySQLdb
import re
import json
import threading
import time
from bs4 import BeautifulSoup
# from lxml import etree

# headers = {
#     'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
#     'Accept-Encoding':'gzip, deflate, sdch, br',
#     'Accept-Language':'en-US,en;q=0.8',
#     'Cache-Control':'no-cache',
#     'Connection':'keep-alive',
#     'Cookie':'d_c0="AABCKio7GwuPTg_zEKo2OENBOST2OEj7t3M=|1483586009"; aliyungf_tc=AQAAAPcIxkOnfwYAAqDsdHeqF0qZN/4B; r_cap_id="NDI0OTAyMTBjOTNkNDZkZDg4MjMzZWM4MDVlZWJkYWI=|1484017503|eb9a8f0838c44b8987b867f3b8b1d25b9bcf8de4"; _xsrf=1ebb4fc81969fe6fad33c2453d185a3c; _zap=220c7b1f-b56f-4281-9c9f-64af404464f3; _ga=GA1.2.19806769.1484042429; s-q=%E7%9C%8B%E5%BE%85; s-i=7; sid=tb9hcp0o; s-t=autocomplete; l_n_c=1; q_c1=09e4a486f51a4f3cb3285e36005f5a49|1485152343000|1485152343000; cap_id="NzkxYTYwOTc1MmYwNDMyN2EwZWJmYmFkNjFkNmZlNmY=|1485152343|95909972859da1e7dc90468786f810374594b724"; l_cap_id="MWFiNjBlODAwYzkwNGU4OTk0MTRkY2ZkZGQwNzk3OTM=|1485152343|6e627a6ad1d25d19f7b23ea6d7206d7411f0d464"; login="NzQwZjU0MTcwNDYzNDU3NTg4ZjdmMjY2MTExYmFmOGQ=|1485152431|961e6ce88e990ac79d16f613f534d578aa9bdbe1"; __utma=51854390.19806769.1484042429.1485172331.1485225558.4; __utmb=51854390.0.10.1485225558; __utmc=51854390; __utmz=51854390.1485170263.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/topic/19776749/organize/entire; __utmv=51854390.100--|2=registration_date=20161216=1^3=entry_date=20161216=1; z_c0=Mi4wQUVBQ0RlNkhBUXNBQUVJcUtqc2JDeGNBQUFCaEFsVk4yeTJ0V0FEZnNBQ3NieDVGblowUHJELUVnX3JXV1NyanlB|1485226773|49f3c4d2e7d1b61e6a059b2167321003fd45430e',
#     'Host':'www.zhihu.com',
#     'Pragma':'no-cache',
#     'Upgrade-Insecure-Requests':'1',
#     'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
# }

def run(res, headers, cursor):
    # Each worker walks its slice of rows, fetches the topic's /hot page,
    # and writes the description and follower count back into zhihu_tag.
    print "process start!\n"
    try:
        for r in res:
            if r[2] not in spider_map:  # skip topics that already have a description
                url = 'https://www.zhihu.com/topic/' + r[2] + '/hot'
                response = requests.get(url, headers=headers)
                resHtml = response.text.encode('utf-8')
                soup = BeautifulSoup(resHtml, 'lxml')
                followed = soup.find('strong')
                description = soup.find('div', 'zm-editable-content')
                followed = followed.get_text() if followed else ''
                description = description.get_text() if description else ''
                # Parameterized query: let MySQLdb escape the scraped text
                # instead of interpolating raw HTML into the SQL string.
                sql = "update zhihu_tag set description=%s, followed=%s where token=%s"
                cursor.execute(sql, (description, followed, r[2]))
                print 'updated', r[2]
                time.sleep(3)
    except Exception, e:
        print e

headers = {
    'Host':'www.zhihu.com',
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
threads = []
spider_map = []
i = 0

conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
res = cursor.execute('set names utf8')
res = cursor.execute('use test')
res = cursor.execute('select * from zhihu_tag')
res = cursor.fetchall()

spider_res = cursor.execute("select * from zhihu_tag where description != ''")
spider_res = cursor.fetchall()
for r in spider_res:
    spider_map.append(r[2])

# Split the result set into chunks of 800 rows, one worker thread per chunk.
# Note: all workers share a single MySQLdb cursor here, which is not thread-safe;
# see the Queue-based sketch after this script for a per-thread-connection variant.
for i in range(0, len(res), 800):
    t = threading.Thread(target=run, args=(res[i:i+800], headers, cursor))
    threads.append(t)

for t in threads:
    t.start()

for t in threads:
    t.join()

print "down\n"

# url = 'https://www.zhihu.com/topic/19681083/hot'
# response = requests.get(url, headers = headers)
# print response.text
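
The slicing loop above hands every worker the same MySQLdb cursor, which is not thread-safe. Below is a minimal alternative sketch (not part of the original script), assuming the same local credentials, the `test` database and the `zhihu_tag` rows loaded above: rows are handed out through a Queue and each worker opens its own connection. The worker count of 4 is an arbitrary choice.

# Sketch: Queue-based work distribution with one MySQL connection per worker.
import Queue
import threading
import MySQLdb

task_queue = Queue.Queue()

def worker():
    # Each worker owns its own connection and cursor; MySQLdb objects must not
    # be shared across threads.
    conn = MySQLdb.connect('127.0.0.1', 'root', '123456789', 'test', charset='utf8')
    conn.autocommit(1)
    cursor = conn.cursor()
    while True:
        row = task_queue.get()
        try:
            run([row], headers, cursor)   # reuse the crawl/update logic defined above
        finally:
            task_queue.task_done()

for _ in range(4):                        # arbitrary worker count
    t = threading.Thread(target=worker)
    t.setDaemon(True)
    t.start()

for row in res:
    task_queue.put(row)
task_queue.join()

Because the workers are daemon threads, the script exits once task_queue.join() returns.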


# Example 2


#!/usr/bin/env python


import requests
import time
import MySQLdb
from bs4 import BeautifulSoup


def run(urls):
    # Re-declare the module-level connection objects so the reconnect below
    # does not shadow them with function-local variables.
    global conn, cursor
    try:
        conn.ping()
    except:
        conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
        conn.autocommit(1)
        cursor = conn.cursor()
        res = cursor.execute('set names utf8')
        res = cursor.execute('use test')

    for url in urls:
        print url
        response = requests.get(url, headers=headers)
        resHtml = response.text.encode('utf-8')
        soup = BeautifulSoup(resHtml, 'lxml')
        # Every linked topic on the organize/entire page carries a data-token attribute.
        tags = soup.select('a[data-za-element-name]')
        for i in tags:
            tag = i.get_text().encode('utf-8')
            token = i['data-token'].encode('utf-8')
            if token in tags_map:
                continue                  # already seen; avoids crawling in circles
            tags_map[token] = tag
            try:
                # Parameterized insert so quotes in tag titles do not break the SQL.
                cursor.execute("insert into zhihu_tag_py(title,token) values(%s,%s)", (tag, token))
                print tag, token
            except Exception, e:
                print e
            # Queue the child topic's own organize/entire page for a later pass.
            url = "https://www.zhihu.com/topic/%s/organize/entire" % token
            urls.append(url)
            print url
        time.sleep(1)                     # simple politeness delay between pages


tags_map = {}
urls = []


conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
res = cursor.execute('set names utf8')
res = cursor.execute('use test')


headers = {
    'Accept':'*/*',
    'Accept-Language':'en-US,en;q=0.8',
    'Cookie':'d_c0="AABCKio7GwuPTg_zEKo2OENBOST2OEj7t3M=|1483586009"; aliyungf_tc=AQAAAPcIxkOnfwYAAqDsdHeqF0qZN/4B; r_cap_id="NDI0OTAyMTBjOTNkNDZkZDg4MjMzZWM4MDVlZWJkYWI=|1484017503|eb9a8f0838c44b8987b867f3b8b1d25b9bcf8de4"; _xsrf=1ebb4fc81969fe6fad33c2453d185a3c; _zap=220c7b1f-b56f-4281-9c9f-64af404464f3; _ga=GA1.2.19806769.1484042429; s-q=%E7%9C%8B%E5%BE%85; s-i=7; sid=tb9hcp0o; s-t=autocomplete; q_c1=09e4a486f51a4f3cb3285e36005f5a49|1485152343000|1485152343000; l_cap_id="OGMzODA3ZGJkMDg2NGY2NmExOGE2YzFkZWYzNzcyYzA=|1486088492|97402dd9f896aead9a3eed24afb33172233d374f"; cap_id="Y2I1NDMxMDczYWM2NDRjNWEzODVhMDdkYjlhMWE1ZDk=|1486088492|6c461c19070b5b9a5327659d4eea4cdc8537d13a"; login="OWYyM2FkMTlkNzI2NDI3Nzk5OGVmMWFjMjk2NDA5NDc=|1486088622|0126ec417721560f8e5eb13d27745353b6cde26a"; n_c=1; __utma=51854390.19806769.1484042429.1486094665.1486098722.5; __utmb=51854390.0.10.1486098722; __utmc=51854390; __utmz=51854390.1486087368.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.100--|2=registration_date=20161216=1^3=entry_date=20161216=1; z_c0=Mi4wQUVBQ0RlNkhBUXNBQUVJcUtqc2JDeGNBQUFCaEFsVk40WGE3V0FCdWsxaGpkTkY1YXJnQTRTd1dHXzVBc0t6ZDRR|1486105848|7ba9e77adc2be0c1966b1f890bb34f553288bd72',
    'Accept-Encoding':'gzip, deflate, sdch, br',
    'Cache-Control':'no-cache',
    'Connection':'keep-alive',
    'Host':'www.zhihu.com',
    'Referer':'https://www.zhihu.com/topic/19776749/organize/entire',
    'Pragma':'no-cache',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


# Seed the crawl from a root topic's organize/entire page.
rootUrl = 'https://www.zhihu.com/topic/19776749/organize/entire'
response = requests.get(rootUrl, headers = headers)
resHtml = response.text.encode('utf-8')
soup = BeautifulSoup(resHtml, 'lxml')
tags = soup.select('a[data-za-element-name]')
# description = soup.find('div', class_ = "zm-editable-content").get_text()
# followed = soup.select('.zm-topic-side-followers-info > a > strong')[0].get_text()
# print response.text
# print tags
for i in tags:
    tag = i.get_text()
    token = i['data-token']
    tags_map[token] = tag
    url = "https://www.zhihu.com/topic/%s/organize/entire" % token
    urls.append(url)


run(urls)
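
One limitation of the script above is that a restart begins again from the single root topic. A minimal resume sketch, assuming zhihu_tag_py already holds rows from an earlier run, is to pre-fill tags_map and the URL list from those rows in place of the bare run(urls) call above:

# Sketch: resume an interrupted crawl from rows already stored in zhihu_tag_py.
cursor.execute("select title, token from zhihu_tag_py")
for title, token in cursor.fetchall():
    tags_map[token] = title
    urls.append("https://www.zhihu.com/topic/%s/organize/entire" % token)
run(urls)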