Continued from the previous post.
Step 5: TaskSchedulerImpl
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// Create the TaskSetManager
val manager = createTaskSetManager(taskSet, maxTaskFailures)
val stage = taskSet.stageId
// Register the TaskSetManager with the stage's TaskSet list; taskSetsByStageIdAndAttempt is a HashMap of TaskSetManagers keyed by stage id and attempt number
val stageTaskSets = taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
stageTaskSets(taskSet.stageAttemptId) = manager
// Throw if there is a conflicting (non-zombie) TaskSet for the same stage
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) => ts.taskSet != taskSet && !ts.isZombie }
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" + s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
/* Request task scheduling. There are two policies, FIFO and FAIR. Tasks are assigned to executors according to each executor's free resources and the locality policy. The scheduling data structure is wrapped in the Pool class: for FIFO the Pool is simply a queue of TaskSetManagers, while for FAIR it is a tree of TaskSetManagers. The Pool maintains TaskSet priorities; when an executor accepts a resource offer (resourceOffer), TaskSets are dequeued and submitted to the executor for execution. See Section 4.3.3 of 《内幕》 for the implementation of this step; a user-facing configuration sketch follows this method. */
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
// This timer only logs a warning if no scheduling happens within a certain time; it does not affect execution
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " + "check your cluster UI to ensure that workers are registered " + "and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
hasReceivedTask = true
}
// Send a ReviveOffers message (extends CoarseGrainedClusterMessage) to the SchedulerBackend; each resource-scheduling mode has its own implementation, and the RPC layer has both Akka and Netty implementations
// In Standalone mode the backend is CoarseGrainedSchedulerBackend; this eventually triggers Executor.launchTask. The message is received by the DriverEndPoint on the driver side
backend.reviveOffers()
}
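The FIFO/FAIR choice mentioned in the comment above is a user-facing setting, spark.scheduler.mode, read when the schedulableBuilder is created. A minimal configuration sketch, assuming a local master; the app name and workload are illustrative only:

import org.apache.spark.{SparkConf, SparkContext}

object SchedulerModeExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("scheduler-mode-demo")
      .setMaster("local[2]")
      .set("spark.scheduler.mode", "FAIR") // "FIFO" (default) or "FAIR"
    val sc = new SparkContext(conf)
    sc.parallelize(1 to 100).count() // an action, which eventually reaches submitTasks above
    sc.stop()
  }
}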
Step 6: CoarseGrainedSchedulerBackend
override def reviveOffers() {
driverEndpoint.send(ReviveOffers)
}
AkkaRpcEnv
override def send(message: Any): Unit = {
actorRef ! AkkaMessage(message, false)
}
CoarseGrainedSchedulerBackend.DriverEndPoint
override def receive: PartialFunction[Any, Unit] = {
case ReviveOffers => // Receive the request and start making offers
makeOffers()
// ... other message cases
}
// Make a "fake" offer to the executors. Fake means that in Standalone mode the Spark driver itself acts as the resource scheduler; in YARN/Mesos mode other SchedulerBackend implementations are used
private def makeOffers() {
// Keep only alive executors. ExecutorData wraps an executor's attributes and state
val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
val workOffers = activeExecutors.map { case (id, executorData) =>
// Build the offer; a WorkerOffer is just a plain holder of executorId, host and freeCores
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toSeq
launchTasks(scheduler.resourceOffers(workOffers))
}
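For reference, a WorkerOffer really is just the small value object the comment describes; a minimal stand-in sketch (field names follow the usage above, not necessarily the exact source):

case class WorkerOffer(executorId: String, host: String, cores: Int)

// Example: the driver offering the free cores of two alive executors to the scheduler.
val offers = Seq(
  WorkerOffer("exec-1", "host-a", cores = 4),
  WorkerOffer("exec-2", "host-b", cores = 2)
)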
TaskSchedulerImpl
// Called by the cluster manager when offering resources on slaves. Tasks are taken from the task sets in priority order and matched against the offers; tasks are assigned to nodes round-robin so that they spread evenly across the cluster
def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Every executor that sends an offer is marked as alive; if a new executor appears, register it
var newExecAvail = false
for (o <- offers) {
// o.host is ExecutorData.host; cache the mapping here
executorIdToHost(o.executorId) = o.host
// Counter of tasks per executor
executorIdToTaskCount.getOrElseUpdate(o.executorId, 0)
if (!executorsByHost.contains(o.host)) {
// Register the executor if its id has not been seen before
executorsByHost(o.host) = new HashSet[String]()
executorAdded(o.executorId, o.host)
newExecAvail = true
}
// rack means the physical rack; in Standalone mode there is no real rack information, and the racks here are mocked by the FakeRackUtil class
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
// Randomly shuffle the offers
val shuffledOffers = Random.shuffle(offers)
// One task buffer per offer (executor); a TaskDescription carries the task id, the TaskSet id, the attempt number, and so on
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
// Sort the TaskSets according to the scheduling policy
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
// If a new executor showed up, recompute locality levels; see TaskSetManager.recomputeLocality
taskSet.executorAdded()
}
}
// Dispatch tasks one by one according to the configured locality preference and task priority; dispatching just means appending to the tasks buffers
// The default locality order, from most to least preferred, is PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
var launchedTask = false
for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
do {
launchedTask = resourceOfferSingleTaskSet(taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
} while (launchedTask)
}
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
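The delay scheduling behind these locality levels is tunable through the spark.locality.wait family of settings; a minimal configuration sketch (the values are illustrative, not recommendations):

import org.apache.spark.SparkConf

// How long the scheduler waits at each locality level before falling back to the next one
// (PROCESS_LOCAL -> NODE_LOCAL -> RACK_LOCAL -> ANY).
val conf = new SparkConf()
  .set("spark.locality.wait", "3s")         // shared default for the levels below
  .set("spark.locality.wait.process", "3s") // wait before giving up on PROCESS_LOCAL
  .set("spark.locality.wait.node", "3s")    // wait before giving up on NODE_LOCAL
  .set("spark.locality.wait.rack", "3s")    // wait before giving up on RACK_LOCAL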
// Try to dispatch tasks from a single TaskSet
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager, maxLocality: TaskLocality, shuffledOffers: Seq[WorkerOffer], availableCpus: Array[Int], tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
var launchedTask = false
// Round-robin over the executors
for (i <- 0 until shuffledOffers.size) {
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
// If there are enough free CPUs, try to dispatch a task
if (availableCpus(i) >= CPUS_PER_TASK) {
try {
// resourceOffer offers this executor to the task set and returns the dispatched task, if any
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
tasks(i) += task
// On success, record the dispatch
val tid = task.taskId
taskIdToTaskSetManager(tid) = taskSet
taskIdToExecutorId(tid) = execId
executorIdToTaskCount(execId) += 1
executorsByHost(host) += execId
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
Step 7: TaskSetManager
/**
* Respond to a resource offer from a single executor and allocate a task
* NOTE: this function is either called with a maxLocality which would be adjusted by delay scheduling algorithm or it will be with a special NO_PREF locality which will be not modified
* @param execId the executor Id of the offered resource
* @param host the host Id of the offered resource
* @param maxLocality the maximum locality we want to schedule the tasks at
*/
@throws[TaskNotSerializableException]
def resourceOffer(execId: String, host: String, maxLocality: TaskLocality.TaskLocality) : Option[TaskDescription] = {
if (!isZombie) {
val curTime = clock.getTimeMillis()
var allowedLocality = maxLocality
// For locality levels that involve wait times, compute the effective locality from how long we have been idle
if (maxLocality != TaskLocality.NO_PREF) {
allowedLocality = getAllowedLocalityLevel(curTime)
if (allowedLocality > maxLocality) {
// We're not allowed to search for farther-away tasks
allowedLocality = maxLocality
}
}
// According to the scheduling policy, dequeue from this TaskSetManager the index of a task that satisfies the locality constraint
dequeueTask(execId, host, allowedLocality) match {
case Some((index, taskLocality, speculative)) => {
// Found one: the dispatch succeeded, so do the bookkeeping and return a TaskDescription
val task = tasks(index)
val taskId = sched.newTaskId()
// Do various bookkeeping
copiesRunning(index) += 1
val attemptNum = taskAttempts(index).size
// Create and record the TaskInfo
val info = new TaskInfo(taskId, index, attemptNum, curTime, execId, host, taskLocality, speculative)
taskInfos(taskId) = info
taskAttempts(index) = info :: taskAttempts(index)
// Update the locality level and the launch timestamp
if (maxLocality != TaskLocality.NO_PREF) {
currentLocalityIndex = getLocalityIndex(taskLocality)
lastLaunchTime = curTime
}
// Serialize the task together with its dependencies (actually their metadata; see serializeWithDependencies below)
val startTime = clock.getTimeMillis()
val serializedTask: ByteBuffer = try {
Task.serializeWithDependencies(task, sched.sc.addedFiles, sched.sc.addedJars, ser)
} catch {
// If serialization fails, abort the computation by throwing; the driver will see the error message below
case NonFatal(e) =>
val msg = s"Failed to serialize task $taskId, not attempting to retry it."
logError(msg, e)
abort(s"$msg Exception during serialization: $e")
throw new TaskNotSerializableException(e)
}
// If the serialized task is too large (default threshold 100 KB), log a warning, but do not throw here
if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 && !emittedTaskSizeWarning) {
emittedTaskSizeWarning = true
logWarning(s"Stage ${task.stageId} contains a task of very large size " + s"(${serializedTask.limit / 1024} KB). The maximum recommended task size is " + s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.")
}
// Mark the task as running
addRunningTask(taskId)
// Task naming convention
val taskName = s"task ${info.id} in stage ${taskSet.id}"
logInfo(s"Starting $taskName (TID $taskId, $host, partition ${task.partitionId}," + s"$taskLocality, ${serializedTask.limit} bytes)")
/* Send a BeginEvent to DAGScheduler.eventProcessLoop, which asynchronously triggers dagScheduler.handleBeginEvent. That handler first posts an event to the listener bus (ListenerBus) and then calls back into DAGScheduler.submitStage() to try to submit the remaining stages. Of course, if the earlier stages have not finished, the callback does nothing yet. Many DAGScheduler messages trigger submitStage, forming a callback loop that eventually submits all stages. */
sched.dagScheduler.taskStarted(task, info)
// Return the TaskDescription
return Some(new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId,
taskName, index, serializedTask))
}
case _ =>
}
}
None
}
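The comparison allowedLocality > maxLocality above works because locality levels form an ordered enumeration, from most to least preferred; a minimal sketch of that idea (a simplified stand-in for the real TaskLocality):

object TaskLocality extends Enumeration {
  // Enumeration values compare by declaration order, so a "greater" level is a weaker preference.
  val PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY = Value
}

assert(TaskLocality.ANY > TaskLocality.PROCESS_LOCAL) // hence the cap at maxLocality above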
Task
// Serialize the task and its file dependencies
def serializeWithDependencies(task: Task[_],currentFiles: HashMap[String, Long], currentJars: HashMap[String, Long], serializer: SerializerInstance): ByteBuffer = {
val out = new ByteBufferOutputStream(4096)
val dataOut = new DataOutputStream(out)
// Serialize the contents of sc.addedFiles (really just dependency metadata: file names and timestamps); the files themselves are not downloaded here, and the same holds for addedJars. addFile can ship resource files
dataOut.writeInt(currentFiles.size)
for ((name, timestamp) <- currentFiles) {
dataOut.writeUTF(name)
dataOut.writeLong(timestamp)
}
// Serialize the contents of sc.addedJars
dataOut.writeInt(currentJars.size)
for ((name, timestamp) <- currentJars) {
dataOut.writeUTF(name)
dataOut.writeLong(timestamp)
}
// Finally serialize the task itself
dataOut.flush()
val taskBytes = serializer.serialize(task)
Utils.writeByteBuffer(taskBytes, out)
out.toByteBuffer
}
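The executor reads this layout back via Task.deserializeWithDependencies (used in Step 9). A rough, hypothetical sketch of one half of that mirror image, assuming exactly the format written above; readDependencies is an illustrative helper, not Spark's API:

import java.io.DataInputStream
import scala.collection.mutable.HashMap

// Layout written above: [file count][name, timestamp]* [jar count][name, timestamp]* [task bytes]
def readDependencies(in: DataInputStream): HashMap[String, Long] = {
  val deps = new HashMap[String, Long]()
  val count = in.readInt()
  for (_ <- 0 until count) {
    val name = in.readUTF()
    val timestamp = in.readLong()
    deps(name) = timestamp
  }
  deps // call once for files, once for jars; the remaining bytes are the serialized task
}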
Step 8: CoarseGrainedSchedulerBackend
// Launch the tasks that have been assigned resources
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
// Iterate over every TaskDescription that received resources
for (task <- tasks.flatten) {
// Serialize the TaskDescription itself (the dependencies were serialized earlier). There are Java and Kryo serializer implementations, both returning bytes. The Java path is plain java.io.Serializable serialization, so the class containing the operator must implement Serializable (and, for a nested class, so must its enclosing class)
val serializedTask = ser.serialize(task)
// Check the Akka message size; if it exceeds the limit, the task set fails right here. The limit is configured via spark.akka.frameSize, and AkkaUtils.reservedSizeBytes is a fixed 200 KB reserve
if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " + "spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " + "spark.akka.frameSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize, AkkaUtils.reservedSizeBytes)
taskSetMgr.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
} else {
val executorData = executorDataMap(task.executorId)
// Record that this executor spends CPUS_PER_TASK cores on the current task
executorData.freeCores -= scheduler.CPUS_PER_TASK
// Send an Akka message to the executor; control is handed over to CoarseGrainedExecutorBackend
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
}
}
}
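The abort message above points at two remedies: raise the frame size, or keep large data out of the task closure with a broadcast variable. A minimal sketch of both, assuming a local master; the sizes and data are illustrative only:

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("frame-size-demo")
  .setMaster("local[2]")
  .set("spark.akka.frameSize", "128") // in MB; raise only if tasks are legitimately large

val sc = new SparkContext(conf)

// Preferred fix: ship large read-only data as a broadcast variable instead of capturing it
// in the closure, so the serialized TaskDescription stays small.
val lookup = sc.broadcast((1 to 1000000).toArray)
val hits = sc.parallelize(1 to 10).map(i => lookup.value(i)).collect()
sc.stop()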
Step 9: CoarseGrainedExecutorBackend
From Section 6.2 of 《内幕》 onwards. At this point the ExecutorBackend already lives inside the Executor process. Executor processes are created when the SparkContext is created, not only when an action is triggered.
override def receive: PartialFunction[Any, Unit] = {
case LaunchTask(data) =>
if (executor == null) {
logError("Received LaunchTask command but executor was null")
System.exit(1)
} else {
// Deserialize the TaskDescription
val taskDesc = ser.deserialize[TaskDescription](data.value)
logInfo("Got assigned task " + taskDesc.taskId)
// The Executor holds the worker node's metadata and environment (SparkEnv); launchTask hands the task to the Executor, which runs it on its task threads
executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber, taskDesc.name, taskDesc.serializedTask)
}
// ... other message cases
}
Executor
def launchTask(context: ExecutorBackend, taskId: Long, attemptNumber: Int, taskName: String, serializedTask: ByteBuffer): Unit = {
// TaskRunner is a Runnable submitted to a thread pool for execution
val tr = new TaskRunner(context, taskId = taskId, attemptNumber = attemptNumber, taskName, serializedTask)
runningTasks.put(taskId, tr)
threadPool.execute(tr)
}
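A self-contained sketch of the pattern used here, with a stand-in Runnable playing the role of TaskRunner; the cached pool is an assumption about the shape of Spark's task-launch pool, not its exact construction:

import java.util.concurrent.{ConcurrentHashMap, Executors}

val threadPool = Executors.newCachedThreadPool()
val runningTasks = new ConcurrentHashMap[Long, Runnable]()

def launchTask(taskId: Long, body: () => Unit): Unit = {
  val runner: Runnable = new Runnable {
    override def run(): Unit =
      try body() finally runningTasks.remove(taskId) // mirrors TaskRunner's finally block
  }
  runningTasks.put(taskId, runner)
  threadPool.execute(runner)
}

launchTask(1L, () => println("running task 1"))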
TaskRunner
override def run(): Unit = {
// TaskMemoryManager manages the memory used by this task inside the executor process
val taskMemoryManager = new TaskMemoryManager(env.memoryManager, taskId)
val deserializeStartTime = System.currentTimeMillis()
// replClassLoader is the external executor class loader configured via spark.repl.class.uri (an org.apache.spark.repl.ExecutorClassLoader); it is not set by default
Thread.currentThread.setContextClassLoader(replClassLoader)
val ser = env.closureSerializer.newInstance()
logInfo(s"Running $taskName (TID $taskId)")
// Report the status change
execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER)
var taskStart: Long = 0
startGCTime = computeTotalGcTime()
try {
val (taskFiles, taskJars, taskBytes) = Task.deserializeWithDependencies(serializedTask) // Deserialize the task and its dependency metadata
updateDependencies(taskFiles, taskJars) // Download/update the dependencies
task = ser.deserialize[Task[Any]](taskBytes, Thread.currentThread.getContextClassLoader) // Deserialize the task itself
task.setTaskMemoryManager(taskMemoryManager) // Attach the memory manager
if (killed) { // If the task has already been killed, bail out with an exception
throw new TaskKilledException
}
logDebug("Task " + taskId + "'s epoch is " + task.epoch)
// MapOutputTracker tracks the locations of stage outputs for later shuffles
env.mapOutputTracker.updateEpoch(task.epoch)
// Run the task and record runtime information; value is the computation result, accumUpdates are the accumulator updates
taskStart = System.currentTimeMillis()
var threwException = true
val (value, accumUpdates) = try {
val res = task.run(taskAttemptId = taskId, attemptNumber = attemptNumber, metricsSystem = env.metricsSystem)
threwException = false
res
} finally {
// Clean up memory; if a leak is detected, log an error or throw
val freedMemory = taskMemoryManager.cleanUpAllAllocatedMemory()
if (freedMemory > 0) {
val errMsg = s"Managed memory leak detected; size = $freedMemory bytes, TID = $taskId"
if (conf.getBoolean("spark.unsafe.exceptionOnMemoryLeak", false) && !threwException) {
throw new SparkException(errMsg)
} else {
logError(errMsg)
}
}
}
val taskFinish = System.currentTimeMillis()
// If the task has been killed, let's fail it.
if (task.killed) {
throw new TaskKilledException
}
// Prepare to send the result (the computed value) back
val resultSer = env.serializer.newInstance()
val beforeSerialization = System.currentTimeMillis()
val valueBytes = resultSer.serialize(value)
val afterSerialization = System.currentTimeMillis()
// Fill in the task metrics (run time, GC time, serialization time, etc.) and update the accumulators
for (m <- task.metrics) {
// Deserialization covers the Task object itself (including partition info) as well as the RDD and operator closures
m.setExecutorDeserializeTime((taskStart - deserializeStartTime) + task.executorDeserializeTime)
// After subtracting deserialization time, what remains is the operator execution time
m.setExecutorRunTime((taskFinish - taskStart) - task.executorDeserializeTime)
m.setJvmGCTime(computeTotalGcTime() - startGCTime)
m.setResultSerializationTime(afterSerialization - beforeSerialization)
m.updateAccumulators()
}
// Wrap the result as a DirectTaskResult
val directResult = new DirectTaskResult(valueBytes, accumUpdates, task.metrics.orNull)
val serializedDirectResult = ser.serialize(directResult)
val resultSize = serializedDirectResult.limit
// directSend = sending directly back to the driver
val serializedResult: ByteBuffer = {
// If the result exceeds maxResultSize, drop it
if (maxResultSize > 0 && resultSize > maxResultSize) {
logWarning(s"Finished $taskName (TID $taskId). Result is larger than maxResultSize " + s"(${Utils.bytesToString(resultSize)} > ${Utils.bytesToString(maxResultSize)}), " + s"dropping it.")
ser.serialize(new IndirectTaskResult[Any](TaskResultBlockId(taskId), resultSize))
} else if (resultSize >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
// Otherwise, if it exceeds the message size limit, persist it as a block and return an IndirectTaskResult
val blockId = TaskResultBlockId(taskId)
env.blockManager.putBytes(blockId, serializedDirectResult, StorageLevel.MEMORY_AND_DISK_SER)
logInfo(s"Finished $taskName (TID $taskId). $resultSize bytes result sent via BlockManager)")
ser.serialize(new IndirectTaskResult[Any](blockId, resultSize))
} else {
// Below the message size limit: return the serialized DirectTaskResult directly
logInfo(s"Finished $taskName (TID $taskId). $resultSize bytes result sent to driver")
serializedDirectResult
}
}
// Send a StatusUpdate Akka message to tell the driver the task has finished and return the result; control goes back to CoarseGrainedSchedulerBackend (sending is implemented in AkkaRpcEnv.send, receiving in CoarseGrainedSchedulerBackend.DriverEndPoint.receive)
execBackend.statusUpdate(taskId, TaskState.FINISHED, serializedResult)
} catch {
// Assorted failure handling
case ffe: FetchFailedException => // Failed to fetch (shuffle) data
val reason = ffe.toTaskEndReason
execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason))
case _: TaskKilledException | _: InterruptedException if task.killed => // The task was killed
logInfo(s"Executor killed $taskName (TID $taskId)")
execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled))
case cDE: CommitDeniedException => // Output commit was denied (e.g. when writing to HDFS)
val reason = cDE.toTaskEndReason
execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason))
case t: Throwable =>
// Attempt to exit cleanly by informing the driver of our failure. If anything goes wrong (or this was a fatal exception), we will delegate to the default uncaught exception handler, which will terminate the Executor.
logError(s"Exception in $taskName (TID $taskId)", t)
val metrics: Option[TaskMetrics] = Option(task).flatMap { task =>
task.metrics.map { m =>
m.setExecutorRunTime(System.currentTimeMillis() - taskStart)
m.setJvmGCTime(computeTotalGcTime() - startGCTime)
m.updateAccumulators()
m
}
}
val serializedTaskEndReason = {
try {
ser.serialize(new ExceptionFailure(t, metrics))
} catch {
case _: NotSerializableException =>
// t is not serializable so just send the stacktrace
ser.serialize(new ExceptionFailure(t, metrics, false))
}
}
execBackend.statusUpdate(taskId, TaskState.FAILED, serializedTaskEndReason)
// Don't forcibly exit unless the exception was inherently fatal, to avoid stopping other tasks unnecessarily.
if (Utils.isFatalError(t)) {
SparkUncaughtExceptionHandler.uncaughtException(t)
}
} finally {
runningTasks.remove(taskId)
}
}
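The maxResultSize branch in the result handling above is governed by spark.driver.maxResultSize; a minimal configuration sketch (the value is illustrative):

import org.apache.spark.SparkConf

// Upper bound on the total serialized result size the driver accepts for one action;
// larger results are dropped with the warning shown above. "0" means unlimited.
val conf = new SparkConf().set("spark.driver.maxResultSize", "2g")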
// Download any missing file or jar dependencies and add jars to the class loader
private def updateDependencies(newFiles: HashMap[String, Long], newJars: HashMap[String, Long]) {
// Hadoop configuration
lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
synchronized {
// File dependencies (resource files)
for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) {
logInfo("Fetching " + name + " with timestamp " + timestamp)
// Fetch file with useCache mode, close cache for local mode.
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf, env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
currentFiles(name) = timestamp
}
// Jar dependencies
for ((name, timestamp) <- newJars) {
val localName = name.split("/").last
val currentTimeStamp = currentJars.get(name) .orElse(currentJars.get(localName)).getOrElse(-1L)
if (currentTimeStamp < timestamp) {
logInfo("Fetching " + name + " with timestamp " + timestamp)
// Fetch file with useCache mode, close cache for local mode.
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf, env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
currentJars(name) = timestamp
// Jars additionally need to be added to the class loader
val url = new File(SparkFiles.getRootDirectory(), localName).toURI.toURL
if (!urlClassLoader.getURLs().contains(url)) {
logInfo("Adding " + url + " to class loader")
urlClassLoader.addURL(url)
}
}
}
}
}
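The newFiles/newJars maps here are populated by sc.addFile / sc.addJar on the driver. A minimal sketch of the user-facing side; the path and file name are placeholders:

import org.apache.spark.{SparkContext, SparkFiles}

def shipLookupFile(sc: SparkContext): Unit = {
  sc.addFile("/tmp/lookup.txt") // driver side: register the dependency
  sc.parallelize(1 to 4).foreach { _ =>
    // executor side: resolve the local copy fetched by updateDependencies/Utils.fetchFile
    val localPath = SparkFiles.get("lookup.txt")
    println(s"task sees dependency at $localPath")
  }
}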
/* Download a file or directory. Files may come from HTTP, Hadoop or local file systems; directories are only supported on Hadoop file systems.
If useCache == true, first try the local cache, which is shared between executors running the same application.
Throws SparkException if the target file already exists but its contents differ. */
def fetchFile(url: String, targetDir: File, conf: SparkConf, securityMgr: SecurityManager, hadoopConf: Configuration, timestamp: Long, useCache: Boolean) {
val fileName = url.split("/").last
val targetFile = new File(targetDir, fileName)
val fetchCacheEnabled = conf.getBoolean("spark.files.useFetchCache", defaultValue = true)
if (useCache && fetchCacheEnabled) {
// Try the cache first
val cachedFileName = s"${url.hashCode}${timestamp}_cache"
val lockFileName = s"${url.hashCode}${timestamp}_lock"
val localDir = new File(getLocalDir(conf))
val lockFile = new File(localDir, lockFileName)
val lockFileChannel = new RandomAccessFile(lockFile, "rw").getChannel()
// File lock to guard against concurrent readers/writers
val lock = lockFileChannel.lock()
val cachedFile = new File(localDir, cachedFileName)
try {
// Cache miss: download
if (!cachedFile.exists()) {
doFetchFile(url, localDir, cachedFileName, conf, securityMgr, hadoopConf)
}
} finally {
lock.release()
lockFileChannel.close()
}
copyFile(url, cachedFile, targetFile, conf.getBoolean("spark.files.overwrite", false))
} else {
// Cache disabled: download directly; the concrete fetch depends on the scheme (Hadoop, HTTP, etc.)
doFetchFile(url, targetDir, fileName, conf, securityMgr, hadoopConf)
}
// If it is an archive, shell out to tar to unpack it
if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) {
logInfo("Untarring " + fileName)
executeAndGetOutput(Seq("tar", "-xzf", fileName), targetDir)
} else if (fileName.endsWith(".tar")) {
logInfo("Untarring " + fileName)
executeAndGetOutput(Seq("tar", "-xf", fileName), targetDir)
}
// Add the execute bit, in case it is a script
FileUtil.chmod(targetFile.getAbsolutePath, "a+x")
// On Windows, also restore the read bit
if (isWindows) {
FileUtil.chmod(targetFile.getAbsolutePath, "u+r")
}
}
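The caching and overwrite behaviour of fetchFile is driven by the two settings read above; a minimal sketch showing them with the defaults that appear in the code:

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.files.useFetchCache", "true") // share fetched files between executors of one application
  .set("spark.files.overwrite", "false")    // whether copyFile may overwrite an existing target file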