#!/usr/bin/env python
import requests
import MySQLdb
import re
import json
import threading
import time
from bs4 import BeautifulSoup
# from lxml import etree
# headers = {
# 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
# 'Accept-Encoding':'gzip, deflate, sdch, br',
# 'Accept-Language':'en-US,en;q=0.8',
# 'Cache-Control':'no-cache',
# 'Connection':'keep-alive',
# 'Cookie':'d_c0="AABCKio7GwuPTg_zEKo2OENBOST2OEj7t3M=|1483586009"; aliyungf_tc=AQAAAPcIxkOnfwYAAqDsdHeqF0qZN/4B; r_cap_id="NDI0OTAyMTBjOTNkNDZkZDg4MjMzZWM4MDVlZWJkYWI=|1484017503|eb9a8f0838c44b8987b867f3b8b1d25b9bcf8de4"; _xsrf=1ebb4fc81969fe6fad33c2453d185a3c; _zap=220c7b1f-b56f-4281-9c9f-64af404464f3; _ga=GA1.2.19806769.1484042429; s-q=%E7%9C%8B%E5%BE%85; s-i=7; sid=tb9hcp0o; s-t=autocomplete; l_n_c=1; q_c1=09e4a486f51a4f3cb3285e36005f5a49|1485152343000|1485152343000; cap_id="NzkxYTYwOTc1MmYwNDMyN2EwZWJmYmFkNjFkNmZlNmY=|1485152343|95909972859da1e7dc90468786f810374594b724"; l_cap_id="MWFiNjBlODAwYzkwNGU4OTk0MTRkY2ZkZGQwNzk3OTM=|1485152343|6e627a6ad1d25d19f7b23ea6d7206d7411f0d464"; login="NzQwZjU0MTcwNDYzNDU3NTg4ZjdmMjY2MTExYmFmOGQ=|1485152431|961e6ce88e990ac79d16f613f534d578aa9bdbe1"; __utma=51854390.19806769.1484042429.1485172331.1485225558.4; __utmb=51854390.0.10.1485225558; __utmc=51854390; __utmz=51854390.1485170263.2.2.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/topic/19776749/organize/entire; __utmv=51854390.100--|2=registration_date=20161216=1^3=entry_date=20161216=1; z_c0=Mi4wQUVBQ0RlNkhBUXNBQUVJcUtqc2JDeGNBQUFCaEFsVk4yeTJ0V0FEZnNBQ3NieDVGblowUHJELUVnX3JXV1NyanlB|1485226773|49f3c4d2e7d1b61e6a059b2167321003fd45430e',
# 'Host':'www.zhihu.com',
# 'Pragma':'no-cache',
# 'Upgrade-Insecure-Requests':'1',
# 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
# }
def run(res,header,cursor):
print "process start!\n"
try:
for r in res:
if r[2] not in spider_map:
url = 'https://www.zhihu.com/topic/' + r[2] + '/hot'
response = requests.get(url, headers = headers)
resHtml = response.text.encode('utf-8')
soup = BeautifulSoup(resHtml, 'lxml')
followed = soup.find('strong')
description = soup.find('div', 'zm-editable-content')
sql = r"update zhihu_tag set description='%s',followed='%s' where token='%s'" % (description, followed, r[2])
cursor.execute(sql)
print sql
time.sleep(3)
except Exception,e:
print e
headers = {
'Host':'www.zhihu.com',
'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
threads = []
spider_map = []
i = 0
conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
res = cursor.execute('set names utf8')
res = cursor.execute('use test')
res = cursor.execute('select * from zhihu_tag')
res = cursor.fetchall()
spider_res = cursor.execute("select * from zhihu_tag where description != ''")
spider_res = cursor.fetchall()
for r in spider_res:
spider_map.append(r[2])
for i in range(0,len(res)+800):
if i%800==0 :
t = threading.Thread(target=run, args=(res[i:i+800], headers, cursor))
threads.append(t)
for i in range(0, len(threads)):
threads[i].start()
for i in range(0, len(threads)):
threads[i].join()
print "down\n"
# url = 'https://www.zhihu.com/topic/19681083/hot'
# response = requests.get(url, headers = headers)
# print response.text
# Example 2
#!/usr/bin/env python
import requests
import time
import MySQLdb
from bs4 import BeautifulSoup
def run(urls):
try:
conn.ping()
except:
conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
res = cursor.execute('set names utf8')
res = cursor.execute('use test')
for url in urls:
print url
response = requests.get(url, headers = headers)
resHtml = response.text.encode('utf-8')
soup = BeautifulSoup(resHtml, 'lxml')
tags = soup.select('a[data-za-element-name]')
for i in tags:
tag = i.get_text().encode('utf-8')
token = i['data-token'].encode('utf-8')
tags_map[token] = tag
try:
sql = r"insert into zhihu_tag_py(title,token) values('%s','%s')" % (tag, token)
cursor.execute(sql)
print sql
except Exception,e:
print e
url = "https://www.zhihu.com/topic/%s/organize/entire" % token
urls.append(url)
print url
# --- crawl bootstrap: shared state, DB handle, and the seed page ---
tags_map = {}  # token -> title, filled by the crawl
urls = []      # BFS queue consumed (and extended) by run()

# Database handle shared with run() above.
conn = MySQLdb.connect('127.0.0.1', 'root', '123456789')
conn.autocommit(1)
cursor = conn.cursor()
res = cursor.execute('set names utf8')
res = cursor.execute('use test')

# Request headers for a logged-in zhihu session (cookie included).
headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cookie': 'd_c0="AABCKio7GwuPTg_zEKo2OENBOST2OEj7t3M=|1483586009"; aliyungf_tc=AQAAAPcIxkOnfwYAAqDsdHeqF0qZN/4B; r_cap_id="NDI0OTAyMTBjOTNkNDZkZDg4MjMzZWM4MDVlZWJkYWI=|1484017503|eb9a8f0838c44b8987b867f3b8b1d25b9bcf8de4"; _xsrf=1ebb4fc81969fe6fad33c2453d185a3c; _zap=220c7b1f-b56f-4281-9c9f-64af404464f3; _ga=GA1.2.19806769.1484042429; s-q=%E7%9C%8B%E5%BE%85; s-i=7; sid=tb9hcp0o; s-t=autocomplete; q_c1=09e4a486f51a4f3cb3285e36005f5a49|1485152343000|1485152343000; l_cap_id="OGMzODA3ZGJkMDg2NGY2NmExOGE2YzFkZWYzNzcyYzA=|1486088492|97402dd9f896aead9a3eed24afb33172233d374f"; cap_id="Y2I1NDMxMDczYWM2NDRjNWEzODVhMDdkYjlhMWE1ZDk=|1486088492|6c461c19070b5b9a5327659d4eea4cdc8537d13a"; login="OWYyM2FkMTlkNzI2NDI3Nzk5OGVmMWFjMjk2NDA5NDc=|1486088622|0126ec417721560f8e5eb13d27745353b6cde26a"; n_c=1; __utma=51854390.19806769.1484042429.1486094665.1486098722.5; __utmb=51854390.0.10.1486098722; __utmc=51854390; __utmz=51854390.1486087368.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.100--|2=registration_date=20161216=1^3=entry_date=20161216=1; z_c0=Mi4wQUVBQ0RlNkhBUXNBQUVJcUtqc2JDeGNBQUFCaEFsVk40WGE3V0FCdWsxaGpkTkY1YXJnQTRTd1dHXzVBc0t6ZDRR|1486105848|7ba9e77adc2be0c1966b1f890bb34f553288bd72',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/topic/19776749/organize/entire',
    'Pragma': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Seed the queue: parse the root topic page and enqueue every child topic.
rootUrl = 'https://www.zhihu.com/topic/19776749/organize/entire'
response = requests.get(rootUrl, headers=headers)
resHtml = response.text.encode('utf-8')
soup = BeautifulSoup(resHtml, 'lxml')
tags = soup.select('a[data-za-element-name]')
for anchor in tags:
    tag = anchor.get_text()
    token = anchor['data-token']
    tags_map[token] = tag
    urls.append("https://www.zhihu.com/topic/%s/organize/entire" % token)

run(urls)