分享一套完整的 Python 采集代码，带服务器代理；修改参数即可把采集到的数据上传到指定接口，同时文章中的图片也会自动采集。
#-*- coding: UTF-8 -*-
import urllib, urllib2
from bs4 import BeautifulSoup
import socket
import requests
import datetime, time
import random
import os,stat,pwd
serverUrl = "" # Server URL; empty by default (presumably the upload endpoint -- confirm against its use)
imgPath = "" # Path where images are stored; empty by default
def chown(d, user='www'):
    """Best-effort: hand ownership of path ``d`` to ``user``.

    Looks up ``user`` in the password database and, when ``d`` is not
    already owned by that account's uid, calls ``os.chown`` with the
    account's uid and primary gid. A failing ``os.chown`` is ignored.

    Args:
        d: Path of an existing file or directory.
        user: Login name of the target owner. Defaults to ``'www'``.

    Raises:
        KeyError: If ``user`` has no entry in the password database.
        OSError: If ``d`` cannot be stat'ed (for example, it does not exist).
    """
    # A single lookup provides both the uid and the primary gid.
    account = pwd.getpwnam(user)

    # Nothing to do when the path already belongs to the target user.
    if os.stat(d).st_uid == account.pw_uid:
        return

    try:
        os.chown(d, account.pw_uid, account.pw_gid)
    except OSError:
        # Changing ownership is best-effort: a failure (e.g. insufficient
        # privileges) is ignored so the caller can carry on. Only OSError is
        # swallowed, so KeyboardInterrupt and programming errors propagate.
        pass
def creatFileName(ext="png"):
    """Build a numeric file name of the form ``<millis><5 digits>.<ext>``.

    The stem is the current time in milliseconds since the epoch followed by
    a random integer between 10000 and 99999 (inclusive).

    Args:
        ext: File extension without the leading dot; converted with ``str``.
            Defaults to ``"png"``.

    Returns:
        The generated file name as a string.
    """
    millis = int(round(time.time() * 1000))
    salt = random.randint(10000, 99999)
    return "".join((str(millis), str(salt), ".", str(ext)))
def creatPath(path):
    """Create directory ``path`` (including parents) unless it already exists.

    A directory created by this call is then passed to ``chown``.

    Args:
        path: Directory to create.

    Returns:
        True if the directory was created by this call, False if something
        already exists at ``path``.

    Raises:
        OSError: If creation fails and nothing exists at ``path`` afterwards
            (for example, a permission error on a parent directory).
    """
    # Attempt the creation first instead of checking beforehand: a directory
    # that appears between an existence check and makedirs() (e.g. created by
    # a concurrent run) would otherwise raise instead of reporting "exists".
    try:
        os.makedirs(path)
    except OSError:
        if os.path.exists(path):
            # Already present -- nothing was created by this call.
            return False
        raise

    # NOTE: only the leaf directory is handed to chown(); any intermediate
    # directories makedirs() created are not passed to it.
    chown(path)
    return True
def go():
ip_list = get_ip_list()
proxies = get_random_ip(ip_list)
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Opera/8.0 (Windows NT 5.1; U; en)",
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
"Mozilla/4.