- 博客(24)
- 收藏
- 关注
原创 爬取小说网站小说信息以及内容
import requests,random,re,timefrom bs4 import BeautifulSoupfrom tomorrow import threadsurls=['http://www.aiquxs.com/modules/article/toplist.php?sort=news&page=%d'%i for i in range(30,2773)]#全部链接
2016-12-31 14:16:41
2349
原创 爬取豆瓣书籍 --开始使用多线程
from bs4 import BeautifulSoupimport requests,random,timefrom tomorrow import threadss=requests.Session()#设置随机头:def h():header = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Window
2016-12-31 14:07:33
398
原创 知乎2
import requests,re,json,os,random,timefrom bs4 import BeautifulSoupdef header():headers = [{"User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS) (compatible; Googlebot-Mobile/2.1; http://www.
2016-12-29 20:30:54
545
原创 知乎1
import requests,re,json,os,random,timefrom bs4 import BeautifulSoupwith open('d://zhihu//zimei//followers_info.txt','r') as f: f=f.read().strip().split('\n')with open('d://heade
2016-12-29 20:30:19
446
原创 淘女郎
import re,os,random,time,requestsfrom urllib import requestfrom bs4 import BeautifulSoupdef h(url): head = [ {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Aco
2016-12-29 20:29:37
580
原创 淘宝——手机
import re,requests,os,json,timefrom bs4 import BeautifulSouph={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36'}s=requ
2016-12-29 20:28:39
534
原创 天猫
import re,requests,os,json,timefrom bs4 import BeautifulSoupfrom selenium import webdriverfrom selenium.webdriver.common.desired_capabilities import DesiredCapabilitiesdcap = dict(DesiredCapab
2016-12-29 20:27:55
387
原创 诸天至尊小说
import re,os,randomfrom urllib import requestfrom bs4 import BeautifulSoupfrom functools import reduceurl='http://www.aiquxs.com/read/50/50271/index.html'req=request.Request(url)res=reques
2016-12-29 20:26:39
567
1
原创 美女图片
from urllib import requestimport re,timeclass spidermain(object): def __init__(self): self.urls=urldownload() self.parser=htmlparser() self.output=output() def
2016-12-29 20:26:00
2786
原创 简书
from selenium import webdriver from bs4 import BeautifulSoupimport requests,re,os,timedriver = webdriver.PhantomJS() urls=["http://www.jianshu.com/search?q=Python+selenium+PhantomJS&page=%d&
2016-12-29 20:24:40
369
原创 豆瓣分类排行电影信息
import requests,os,re,time,randomfrom bs4 import BeautifulSoupdef header(): headers = [ {"User-Agent": "Mozilla/5.0 (Android; Mobile; rv:27.0) Gecko/27.0 Firefox/27.0"}, {"User-Agen
2016-12-29 20:24:03
472
原创 获取豆瓣分类排行电影的URL
from bs4 import BeautifulSoupfrom selenium import webdriverimport re,timedriver = webdriver.Chrome(executable_path=r'E:\rj\Chrome\chromedriver.exe')with open('f://ty.txt','r') as f:f=f.read(
2016-12-29 20:23:18
1029
原创 获取煎蛋网图片url
import requests,re,random,timefrom bs4 import BeautifulSoupurls=['http://jandan.net/ooxx/page-%d#comments'%i for i in range(1501,2281)]def header(): headers = [ {"User-Agent": "Moz
2016-12-29 20:22:42
606
原创 代理IP
from selenium import webdriverimport requests,refrom bs4 import BeautifulSoupurls=['http://www.kuaidaili.com/free/inha/%d/'%i for i in range(1,33)]def paser(url): print(url) m=reques
2016-12-29 20:21:52
322
原创 猎聘
import requests,os,re,time,random,asynciofrom bs4 import BeautifulSoupimport urllib.parse#from selenium import webdriverdef header(): headers = [ {"User-Agent": "Mozilla/5.0 (Android
2016-12-29 20:21:19
381
原创 拉勾网
import requests,re,time,random,os,pinyinfrom bs4 import BeautifulSoupm=requests.get('http://www.lagou.com/').content.decode('utf-8')bs=BeautifulSoup(m,'lxml')n=bs.find("div","mainNavs").text
2016-12-29 20:20:30
323
原创 牛仔网股评
import requests,os,re,random,timefrom bs4 import BeautifulSoupdef header(): headers = [ {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.
2016-12-29 20:19:43
494
原创 联盟王者
import os,requests,re,randomfrom bs4 import BeautifulSoupdef header(): headers = [ {"User-Agent": "Mozilla/5.0 (Android; Mobile; rv:27.0) Gecko/27.0 Firefox/27.0"}, {"User-Agent": "
2016-12-29 20:19:00
400
原创 登录知乎
import requestswith open('d://cookie.txt','r') as f:cookies={}for line in f.read().split(';'):name,v=line.strip().split('=',1)cookies[name]=vs=requests.Session()url='http://www.z
2016-12-29 20:18:29
1056
原创 meizitu美女URL
#http://www.meizitu.com/a/4000.htmlimport urllib.request,socket,re,sys,osf=open('d:\\3.json','a')a=set([])for i in range(5366,5380): url='http://www.meizitu.com/a/'+str(i)+'.html' pr
2016-12-29 20:17:46
4220
原创 爬CSDN博客
import urllib.request,re,time,random,gzipfrom bs4 import BeautifulSoupdef savefile(data,idx): path='d:\\u\\o_'+str(idx+1)+'.txt' file=open(path,'wb') page='当前页:'+str(idx+1)+'\n'
2016-12-29 20:17:04
366
原创 first新人报道
import urllib.requestimport io ,sys#sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8') sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')url="http://www.baidu.co
2016-12-29 20:16:15
145
原创 urllib 小白
import urllib.requestimport io ,sys#sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8') sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')def saveFile(data):
2016-12-29 20:15:10
117
空空如也
空空如也
TA创建的收藏夹 TA关注的收藏夹
TA关注的人