Low-level WordCount
package tcode.chapter07

import scala.io.Source

object $17_WordCountLow {
  def main(args: Array[String]): Unit = {
    // 1. Read the file
    val datas = Source.fromFile("datas/wc2.txt", "utf-8").getLines().toList
    // List(hello hadoop flume kafka, kafka spark scala hadoop, hello java python hadoop, kafka flume spark spark, hello flume scala java)

    // 2. Split and flatten
    val words = datas.flatMap(line => line.split(" "))
    // List(hello, hadoop, flume, kafka, kafka, spark, ...)

    // 3. Group by word
    val groupedMap = words.groupBy(x => x)
    // Map(
    //   hello -> List(hello, hello, hello, hello, ...),
    //   ...
    // )

    // 4. Count occurrences
    val result = groupedMap.map(x => {
      // x = hello -> List(hello, hello, hello, hello, ...)
      (x._1, x._2.size)
    })
    // Map((word, total count), (word, total count), ...)

    result.foreach(x => println(x))
    println("-" * 100)

    // Equivalent one-liner:
    // Source.fromFile("datas/wc.txt", "utf-8").getLines().toList.flatMap(_.split(" ")).groupBy(x => x).map(x => (x._1, x._2.size)).foreach(println(_))
  }
}
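For comparison, the grouping step can be skipped entirely by folding the word list into a Map of counts in a single pass, so no intermediate List(hello, hello, ...) groups are built. A minimal sketch, with the sample data hard-coded instead of read from datas/wc2.txt; the object name $17b_WordCountFold is made up for illustration:

package tcode.chapter07

object $17b_WordCountFold {
  def main(args: Array[String]): Unit = {
    // Same sample data as wc2.txt, hard-coded to keep the sketch self-contained
    val datas = List(
      "hello hadoop flume kafka",
      "kafka spark scala hadoop",
      "hello java python hadoop",
      "kafka flume spark spark",
      "hello flume scala java")

    // Fold each word into an accumulator Map, bumping its count as we go
    val result = datas
      .flatMap(_.split(" "))
      .foldLeft(Map.empty[String, Int]) { (acc, word) =>
        acc.updated(word, acc.getOrElse(word, 0) + 1)
      }

    result.foreach(println)
  }
}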
High-level WordCount
package tcode.chapter07

object $18_WordCountHight {
  def main(args: Array[String]): Unit = {
    val tupleList = List(("Hello Scala Spark World", 4), ("Hello Scala Spark", 3), ("Hello Scala", 2), ("Hello", 1))

    // 1. Split and flatten, giving each word its sentence's initial count
    val words = tupleList.flatMap(x => {
      // x = ("Hello Scala Spark World", 4)
      // Split the sentence into words
      val arr = x._1.split(" ") // Array(Hello, Scala, Spark, World)
      val tu = arr.map(y => {
        // y = Hello
        (y, x._2)
      })
      tu
    })
    // List((Hello,4), (Scala,4), (Spark,4), (World,4), (Hello,3), (Scala,3), (Spark,3), (Hello,2), (Scala,2), (Hello,1))

    // 2. Group by word
    val groupedMap = words.groupBy(x => x._1)
    // Map(
    //   Hello -> List((Hello,4), (Hello,3), (Hello,2), (Hello,1)),
    //   ...
    // )

    // 3. Sum the counts for each word
    val result = groupedMap.map(x => {
      // x = Hello -> List((Hello,4), (Hello,3), (Hello,2), (Hello,1))
      // Alternative using reduce:
      // val r = x._2.reduce((agg, curr) => (agg._1, agg._2 + curr._2))
      // r
      val r = x._2.map(y => y._2).sum
      (x._1, r)
    })

    // 4. Print the result
    result.foreach(x => println(x))
    // (Hello,10), (Scala,9), (Spark,7), (World,4)
  }
}
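On Scala 2.13+, steps 1-3 collapse into a single groupMapReduce call, which groups by key, projects each element to its count, and reduces within each group in one traversal. A minimal sketch, assuming Scala 2.13 is available; the object name $18b_WordCountGMR is made up for illustration:

package tcode.chapter07

object $18b_WordCountGMR {
  def main(args: Array[String]): Unit = {
    val tupleList = List(("Hello Scala Spark World", 4), ("Hello Scala Spark", 3), ("Hello Scala", 2), ("Hello", 1))

    // groupMapReduce(key)(value)(combine): group by word, keep each pair's count,
    // and sum the counts inside each group without materializing the group lists
    val result = tupleList
      .flatMap { case (line, n) => line.split(" ").map(word => (word, n)) }
      .groupMapReduce(_._1)(_._2)(_ + _)

    result.foreach(println) // (Hello,10), (Scala,9), (Spark,7), (World,4)
  }
}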