SQL context available as sqlContext.
scala> var myVar : String = "Foo"
myVar: String = Foo
scala> val myVal : String = "Foo"
myVal: String = Foo
scala> var myVar : String = "Foo1"
myVar: String = Foo1
scala> myVal="aa"
<console>:27: error: reassignment to val
myVal="aa"
^
scala> myVal="aa";
<console>:27: error: reassignment to val
myVal="aa";
^
scala> myVal="Foo";
<console>:27: error: reassignment to val
myVal="Foo";
^
scala> myVar="jack"
myVar: String = jack
scala> var myVar = 10;
myVar: Int = 10
scala> val myVal = "Hello, Scala!";
myVal: String = Hello, Scala!
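A minimal sketch consolidating what the session above shows: `var` declares a reassignable variable, `val` an immutable binding, and the type annotation is optional because the compiler infers it.

```scala
object VarVsVal {
  def main(args: Array[String]): Unit = {
    var myVar: String = "Foo"   // reassignable
    val myVal = "Hello, Scala!" // immutable; type String is inferred
    myVar = "jack"              // fine: a var can be reassigned
    // myVal = "aa"             // would not compile: reassignment to val
    println(myVar + " " + myVal)
  }
}
```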
scala> val (myVar1: Int, myVar2: String) = Pair(40, "Foo")
myVar1: Int = 40
myVar2: String = Foo
scala> val jack: (myVar1: Int, myVar2: String) = Pair(40, "Foo")
<console>:1: error: ')' expected but ':' found.
val jack: (myVar1: Int, myVar2: String) = Pair(40, "Foo")
^
scala> val jack: (myVar1: Int, myVar2: String) = Pair(40, "Foo")
<console>:1: error: ')' expected but ':' found.
val jack: (myVar1: Int, myVar2: String) = Pair(40, "Foo")
^
scala> val (myVar1, myVar2) = Pair(40, "Foo")
myVar1: Int = 40
myVar2: String = Foo
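The two failing attempts above put variable names inside a type position; a tuple type is written with types only. A small sketch of the working forms (Pair is just an alias for Tuple2, so a plain tuple literal does the same thing):

```scala
// Destructuring a pair into two bindings (types may be annotated or inferred)
val (myVar1, myVar2) = (40, "Foo")

// Binding the whole pair to one name: the tuple type is (Int, String),
// not (myVar1: Int, myVar2: String)
val jack: (Int, String) = (40, "Foo")
println(jack._1 + " " + jack._2)
```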
scala> object Test{
| def main(args:Array[String]){
| var x = 10
| if(x ...
scala> Test
res0: Test.type = $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$Test$@63843e11
scala> Test
res1: Test.type = $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$Test$@63843e11
scala> Test.main("a")
<console>:28: error: type mismatch;
found : String("a")
required: Array[String]
Test.main("a")
^
scala> Test.main(List.("a"))
<console>:1: error: identifier expected but '(' found.
Test.main(List.("a"))
^
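Both calls fail because main expects an Array[String], and `List.("a")` is not valid syntax at all. A minimal sketch of a call that type-checks, assuming a simple main body since the definition above is abbreviated:

```scala
object Test {
  def main(args: Array[String]): Unit = {
    var x = 10
    if (x < 20) println("x = " + x)  // assumed body for illustration
  }
}

Test.main(Array("a"))        // pass an Array[String], not a bare String
Test.main(List("a").toArray) // a List can be converted explicitly
```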
scala> val arr = sc.parallelize(Array(("A",1),("B",2),("C",3)));
arr: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:27
scala> arr.flatmap(x=>(x._1+x_2)).foreach(println);
<console>:30: error: value flatmap is not a member of org.apache.spark.rdd.RDD[(String, Int)]
arr.flatmap(x=>(x._1+x_2)).foreach(println);
^
scala> arr.map(x=>(x._1+x_2)).foreach(println);
<console>:30: error: not found: value x_2
arr.map(x=>(x._1+x_2)).foreach(println);
^
scala> arr.map(x=>(x._1+x._2)).foreach(println);
scala> val arr = sc.parallelize(Array(("A",1),("B",2),("C",3)));
arr: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[2] at parallelize at <console>:27
scala> arr.map(x=>(x._1+x._2)).foreach(println);
scala> List("a","b").foreach(println);
a
b
scala> val arr = sc.parallelize(Array(("A",1),("B",2),("C",3)));
arr: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[4] at parallelize at <console>:27
scala> println(arr);
ParallelCollectionRDD[4] at parallelize at <console>:27
scala> arr.foreach(println);
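The method name is flatMap (not flatmap) and the tuple accessor is x._2 (not x_2). Note also that nothing is printed after arr.foreach(println) or the map(...).foreach(println) calls above: when the shell is attached to a cluster, the closure runs on the executors, so the output lands in executor logs rather than the driver console. A minimal sketch of bringing the data back to the driver before printing, assuming the SparkContext sc from this session:

```scala
val arr = sc.parallelize(Array(("A", 1), ("B", 2), ("C", 3)))

// collect() materializes the RDD as a local Array on the driver,
// so println output shows up in the shell
arr.collect().foreach(println)

// map keeps one output element per input: "A1", "B2", "C3"
arr.map(x => x._1 + x._2).collect().foreach(println)
```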
scala> val arr=sc.parallelize(Array(("A",1),("B",2),("C",3)))
arr: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[5] at parallelize at <console>:27
scala> arr.flatmap(x=>(x._1+x._2)).foreach(println)
<console>:30: error: value flatmap is not a member of org.apache.spark.rdd.RDD[(String, Int)]
arr.flatmap(x=>(x._1+x._2)).foreach(println)
^
scala> val arr=sc.parallelize(Array(("A",1),("B",2),("C",3)))
arr: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[6] at parallelize at <console>:27
scala> arr.map(x=>(x._1+x._2)).foreach(println)
scala> arr.flatMap(x=>(x._1+x._2)).foreach(println)
scala> arr.flatMap(x=>(x._1+x._2)).foreach(println)
scala> arr.flatMap(x=>(x._1+x._2)).foreach(println)
scala> arr.map(x=>(x._1+","+x._2)).foreach(println)
scala> arr.first();
res16: (String, Int) = (A,1)
scala> arr.count()
res17: Long = 3
scala> val rdd = sc.parallelize(1,10,2);
<console>:27: error: too many arguments for method parallelize: (seq: Seq[T], numSlices: Int)(implicit evidence$1: scala.reflect.ClassTag[T])org.apache.spark.rdd.RDD[T]
val rdd = sc.parallelize(1,10,2);
^
scala> val rdd = sc.parallelize(1 to 10,2);
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[12] at parallelize at <console>:27
scala> val reduceRDD = rdd.reduce(_ + _)
reduceRDD: Int = 55
scala> rdd.first()
res18: Int = 1
scala> rdd.count
res19: Long = 10
scala> val reduceRDD1 = rdd.reduce(_ - _)
reduceRDD1: Int = 15
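reduce(_ + _) gives 55 regardless of partitioning, but reduce(_ - _) only happens to give 15 here: subtraction is neither associative nor commutative, so the answer depends on how the data is split and how the partition results are merged. One consistent reading of the 15 above, assuming the even split of 1 to 10 into [1..5] and [6..10]:

```scala
// Illustrative only: reproducing the per-partition folds for sc.parallelize(1 to 10, 2)
val part1 = (1 to 5).reduce(_ - _)    // 1-2-3-4-5  = -13
val part2 = (6 to 10).reduce(_ - _)   // 6-7-8-9-10 = -28
val merged = part1 - part2            // -13 - (-28) = 15, matching reduceRDD1 above
```

A different partitioning or merge order would give a different number, so reduce should only be used with associative, commutative functions.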
scala> val countRDD = rdd.count()
countRDD: Long = 10
scala> val firstRDD = rdd.first()
firstRDD: Int = 1
scala> val takeRDD = rdd.take(5)
takeRDD: Array[Int] = Array(1, 2, 3, 4, 5)
scala> val topRDD = rdd.top(3)
topRDD: Array[Int] = Array(10, 9, 8)
scala> val takeOrderedRDD = rdd.takeOrdered(3)
takeOrderedRDD: Array[Int] = Array(1, 2, 3)
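top(n) returns the n largest elements in descending order, while takeOrdered(n) returns the n smallest in ascending order; both are driver-side actions. A small sketch (takeOrdered with a reversed ordering behaves like top):

```scala
val rdd = sc.parallelize(1 to 10, 2)

rdd.top(3)                                 // Array(10, 9, 8)
rdd.takeOrdered(3)                         // Array(1, 2, 3)
rdd.takeOrdered(3)(Ordering[Int].reverse)  // Array(10, 9, 8), same as top(3)
```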
scala> println("func +: "+reduceRDD)
func +: 55
scala> println("func -: "+reduceRDD1)
func -: 15
scala> println("count: "+countRDD)
count: 10
scala> println("first: "+firstRDD)
first: 1
scala> println("take:")
take:
scala> takeRDD.foreach(x => print(x +" "))
1 2 3 4 5
scala> takeRDD.foreach(x => println(x +" "))
1
2
3
4
5
scala> arr.flatMap(x=>(x._1+x._2)).foreach(x=>println(x +""))
scala> val arr=sc.parallelize(Array(("A",1),("B",2),("C",3)))
arr: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[16] at parallelize at <console>:27
scala> arr.take(1)
res28: Array[(String, Int)] = Array((A,1))
scala> arr.foreach(x=>println(x._1 +","+x._2))
scala> arr.lookup("A");
res30: Seq[Int] = WrappedArray(1)
scala> arr.countByKey()
res31: scala.collection.Map[String,Long] = Map(B -> 1, A -> 1, C -> 1)
scala> arr.collectAsMap
res32: scala.collection.Map[String,Int] = Map(A -> 1, C -> 3, B -> 2)
scala> val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3))
arr: List[(String, Int)] = List((A,1), (B,2), (A,2), (B,3))
scala> val rdd = sc.parallelize(arr,2)
rdd: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[21] at parallelize at <console>:29
scala> val countByKeyRDD = rdd.countByKey()
countByKeyRDD: scala.collection.Map[String,Long] = Map(B -> 2, A -> 2)
scala> val collectAsMapRDD = rdd.collectAsMap()
collectAsMapRDD: scala.collection.Map[String,Int] = Map(A -> 2, B -> 3)
scala> countByKeyRDD.foreach(print)
(B,2)(A,2)
scala> collectAsMapRDD.foreach(print)
(A,2)(B,3)
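countByKey counts how many records share each key, while collectAsMap builds a driver-side Map, so duplicate keys collapse and only one value per key survives (here A -> 2 and B -> 3). Both actions pull their result to the driver. A minimal sketch, including reduceByKey for the case where values should be summed rather than dropped:

```scala
val rdd = sc.parallelize(List(("A", 1), ("B", 2), ("A", 2), ("B", 3)), 2)

rdd.countByKey()                  // Map(A -> 2, B -> 2): number of records per key
rdd.collectAsMap()                // Map(A -> 2, B -> 3): one surviving value per key
rdd.reduceByKey(_ + _).collect()  // Array((A,3), (B,5)): values summed per key
```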
scala> val rdd = sc.parallelize(List(1,2,3,4),2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[24] at parallelize at <console>:27
scala> val aggregateRDD = rdd.aggregate(2)(_+_,_ * _)
aggregateRDD: Int = 90
scala> println(aggregateRDD)
90
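Why 90: aggregate applies the zero value 2 once per partition for the seqOp and once more when combining. With the two partitions [1, 2] and [3, 4], the seqOp (_+_) gives 2+1+2 = 5 and 2+3+4 = 9, and the combOp (_*_) then folds 2 * 5 * 9 = 90. Because the zero value is reused per partition, anything other than an identity element makes the result depend on the number of partitions. A worked sketch of the same computation, assuming the even split parallelize produces here:

```scala
val zero = 2
val part1  = List(1, 2).foldLeft(zero)(_ + _)           // 2 + 1 + 2 = 5
val part2  = List(3, 4).foldLeft(zero)(_ + _)           // 2 + 3 + 4 = 9
val result = List(part1, part2).foldLeft(zero)(_ * _)   // 2 * 5 * 9 = 90
```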
scala> def setOp(a:String,b:String):String={
| println("seqOp:"+a+"\t"+b);
| math.min(a.length, b.length).toString();
| }
setOp: (a: String, b: String)String
scala> def combOp(a:String,b:String):String = {
| println("combOp:"+a+"\t"+b);
| a+b;
| }
combOp: (a: String, b: String)String
scala> val z = sc.parallelize(List("12","23","345","4567"),2);
z: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[25] at parallelize at <console>:27
scala> z.aggregate("")(seqOp,combOp);
<console>:32: error: not found: value seqOp
z.aggregate("")(seqOp,combOp);
^
scala> def seqOp(a:String,b:String):String={
| println("seqOp:"+a+"\t"+b);
| math.min(a.length, b.length).toString();
| }
seqOp: (a: String, b: String)String
scala> z.aggregate("")(seqOp,combOp);
[Stage 30:> (0 + 2) / 2]combOp: 1
[Stage 30:=============================> (1 + 1) / 2]combOp:1 1
res37: String = 11
scala> z.aggregate("")(seqOp,combOp);
combOp: 1
combOp:1 1
res38: String = 11
scala> z.aggregate("")(seqOp,combOp);
combOp: 1
combOp:1 1
res39: String = 11
scala> def seqOp(a:String,b:String):String={
| println("seqOp jack:"+a+"\t"+b);
| math.min(a.length, b.length).toString();
| }
seqOp: (a: String, b: String)String
scala> def combOp(a:String,b:String):String = {
| println("combOp jack:"+a+"\t"+b);
| a+b;
| }
combOp: (a: String, b: String)String
scala> val z = sc.parallelize(List("12","23","345","4567"),2);
z: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[26] at parallelize at <console>:27
scala> z.aggregate("")(seqOp,combOp);
[Stage 33:> (0 + 2) / 2]combOp jack: 1
[Stage 33:=============================> (1 + 1) / 2]combOp jack:1 1
res40: String = 11
scala> z.aggregate("")(seqOp,combOp);
combOp jack: 1
[Stage 34:=============================> (1 + 1) / 2]combOp jack:1 1
res41: String = 11
scala> z.aggregate("")(seqOp,combOp);
[Stage 35:> (0 + 0) / 2]combOp jack: 1
combOp jack:1 1
res42: String = 11
scala> z.first()
res43: String = 12
scala> z.top(4)
res44: Array[String] = Array(4567, 345, 23, 12)
scala> z.count
res45: Long = 4
scala> val z = List("12","23","345","4567");
z: List[String] = List(12, 23, 345, 4567)
scala> z.aggregate("")(seqOp,combOp);
seqOp jack: 12
seqOp jack:0 23
seqOp jack:1 345
seqOp jack:1 4567
res46: String = 1
scala> def seqOp(a:String,b:String):String={
| println("seqOp jack:"+a+"\t"+b);
| println("return:"+math.min(a.length, b.length).toString());
| return math.min(a.length, b.length).toString();
| }
seqOp: (a: String, b: String)String
scala> def combOp(a:String,b:String):String = {
| println("combOp jack:"+a+"\t"+b);
| a+b;
| }
combOp: (a: String, b: String)String
scala> val z = List("12","23","345","4567");
z: List[String] = List(12, 23, 345, 4567)
scala> z.aggregate("")(seqOp,combOp);
seqOp jack: 12
return:0
seqOp jack:0 23
return:1
seqOp jack:1 345
return:1
seqOp jack:1 4567
return:1
res47: String = 1
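Two things explain the outputs above. On the RDD (two partitions ["12","23"] and ["345","4567"]) the seqOp printlns run on the executors, so only the combOp lines reach the driver console: each partition folds down to "1" (the running minimum length, as a string), and combOp concatenates "" + "1" + "1" = "11". On the local List, Scala's aggregate is just a sequential fold, so seqOp visits all four elements in order and combOp is never called, which is why the result is "1" and no combOp line is printed. A sketch of the same arithmetic, under the even-split assumption:

```scala
def seqOp(a: String, b: String): String  = math.min(a.length, b.length).toString
def combOp(a: String, b: String): String = a + b

// RDD path: fold each partition with seqOp, then merge the partition results with combOp
val p1 = List("12", "23").foldLeft("")(seqOp)       // "" -> "0" -> "1"
val p2 = List("345", "4567").foldLeft("")(seqOp)    // "" -> "0" -> "1"
val rddResult = List(p1, p2).foldLeft("")(combOp)   // "11"

// Local List path: one sequential pass, combOp unused
val listResult = List("12", "23", "345", "4567").foldLeft("")(seqOp)  // "1"
```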
scala> countByKeyRDD.foreach(print)
(B,2)(A,2)
scala> collectAsMapRDD.foreach(print)
(A,2)(B,3)
scala> collectAsMapRDD.saveAsTextFile("hdfs://192.168.1.56/tmp/jackteset")
<console>:34: error: value saveAsTextFile is not a member of scala.collection.Map[String,Int]
collectAsMapRDD.saveAsTextFile("hdfs://192.168.1.56/tmp/jackteset")
^
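collectAsMap returns a plain scala.collection.Map on the driver, which has no saveAsTextFile; only RDDs do. A minimal sketch of writing the data out instead, assuming the HDFS paths (the suffixed names are illustrative) are writable: either skip the collect entirely, or re-parallelize the local map.

```scala
// Preferred: stay in RDD land and write directly
rdd.saveAsTextFile("hdfs://192.168.1.56/tmp/jackteset_rdd")      // illustrative path

// Or turn the collected Map back into an RDD first
sc.parallelize(collectAsMapRDD.toSeq)
  .saveAsTextFile("hdfs://192.168.1.56/tmp/jackteset_map")       // illustrative path
```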
scala> var rdd1 = sc.makeRDD(1 to 10,2)
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[28] at makeRDD at <console>:27
scala> rdd1.saveAsTextFile("hdfs://192.168.1.56/tmp/jackteset")
scala> rdd1.saveAsTextFile("hdfs://192.168.1.56/tmp/jackteset1",classOf[org.apache.hadoop.io.compress.SnappyCodec])
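The second saveAsTextFile overload takes a compression codec class; with SnappyCodec the part files are written Snappy-compressed, assuming the Snappy native libraries are available to Hadoop on the cluster. A brief sketch of reading the compressed output back, which Hadoop's input format should decompress transparently based on the file extension:

```scala
// Read the Snappy-compressed text output back; decompression is handled by the codec
val back = sc.textFile("hdfs://192.168.1.56/tmp/jackteset1")
back.count()   // 10 lines, one per element of rdd1
```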