今天我们就来谈谈 Python 在爬虫及实际应用方面的小技巧。
一、python爬取wooyun镜像站文章
代码放出来吧,两个程序,一个抓来图片放在本地,一个抓来页面源码:
wooyun_spider:
[Python] 纯文本查看 复制代码
#coding: utf-8
import requests
from bs4 import BeautifulSoup
import re
import os
def dopost(url):
    """Fetch ``url`` and return the raw response body.

    Despite the name, this issues an HTTP GET (``requests.get``) with a
    6-second timeout. Exceptions raised by ``requests`` are not caught here.

    Returns ``r.content`` when the status code is 200, otherwise ``None``.
    """
    response = requests.get(url, timeout=6)
    if response.status_code != 200:
        # Any non-200 answer is reported to the caller as "no content".
        return None
    return response.content
def bsparser(content):
    """Return the article link tags found in a page.

    Parses ``content`` with BeautifulSoup's ``html.parser`` (declared as
    UTF-8) and returns every ``<a>`` tag whose ``href`` matches
    ``/wooyun-<digits>-<digits>.html``.
    """
    link_pattern = re.compile(r'/wooyun\-\d+\-\d+\.html')
    document = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    return document.find_all('a', href=link_pattern)
dic = {}
if __name__ == '__main__':
if os.path.exists(r'd:\wooyun'):
print 'd://wooyun dir exists...spider begin'
else:
os.mkdir(r'd:\wooyun')
print 'mkdir d://wooyun...spider begin'
for i in range(1,5):
ii = '%d' % i
url = 'http://wy.hxsec.com/search?keywords=&&content_search_by=by_bugs&&search_by_html=False&&page=' + ii
content = dopost(url)
bqs = bsparser(content)
for bq in bqs:
dic[bq.get_text()] = bq['href']
#for k in dic.keys ;for v in dic.values ;
for (k, v) in dic.items():
#print k, v
try:
con = dopost(v)
file_str = 'd://wooyun/