Python爬虫
GrofChen
人生苦短
展开
-
python之多线程爬取绍兴e网招聘信息
import threading,urllib,jsonfrom queue import Queue from bs4 import BeautifulSoup as bsCRAWL_EXIT = Falseclass ThreadCrawl(threading.Thread): def __init__(self,threadName,pageQueue,dataQueue):...原创 2019-09-30 16:38:44 · 344 阅读 · 0 评论 -
python之多线程爬取笑话
from bs4 import BeautifulSoup as bsimport requests,json,timefile = open("duanzi.json", "a")def crawl_html(num): headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:69.0) Gecko/20...原创 2019-10-12 15:27:27 · 268 阅读 · 0 评论 -
python之爬取笑话
from bs4 import BeautifulSoup as bsimport requests,json,timefile = open("duanzi.json", "a")def crawl_html(num): headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:69.0) Gecko/20...原创 2019-10-12 14:43:57 · 410 阅读 · 0 评论 -
python之爬取百度贴吧
有道翻译import urllib.requestimport urllib.parseurl='http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule' # 有道翻译错误代码50时,去掉translate_o?的_oheaders={"User-Agent":"Mozilla/5.0 (Windo...原创 2019-09-16 16:56:23 · 305 阅读 · 0 评论 -
python之爬取邮箱电话
这里使用requests库爬取网页要比urllib库方便用finditer查询import requestsimport reurl='https://www.flyai.com/' # 待爬取的网页html=requests.get(url).text # text为转化为str数据pat='(\w+@\w+.com)|(\d{11})' # 正则模式res=re.finditer...原创 2019-09-18 15:39:33 · 1999 阅读 · 0 评论 -
python之爬取绍兴e网招聘信息
from bs4 import BeautifulSoup as bsimport urllibimport reclass Spider(object): def __init__(self): print('init') self.begin_page=1 self.end_page=4 self.base_url=...原创 2019-09-25 20:06:53 · 707 阅读 · 0 评论 -
python之爬取小说
from bs4 import BeautifulSoup as bsimport urllibclass Spider(object): def __init__(self): self.base_url='https://www.biquge.com.cn' def load_page(self,url): headers={"User-Ag...原创 2019-09-27 08:52:03 · 309 阅读 · 0 评论