Commit e042183

Merge pull request #197 from JohnSnowLabs/fix-vivekn-train-with-col
Fixed Vivekn sentiment analysis when training from dataset
2 parents 44bbd0f + 547322c commit e042183

File tree

6 files changed: +122 −11 lines changed

CHANGELOG

Lines changed: 8 additions & 6 deletions

@@ -13,19 +13,21 @@ Bug fixes
 * Fixed a bug causing the library to fail when trying to save or read an annotator with an unset Feature without default
 * Added missing default Param value to SentenceDetector. Thanks @superman24-7
 * Symmetric spell checker now utilizes List instead of ListBuffer on its prediction layer
-
----------------
-Other
----------------
-* Downloader now works retroactively when a newer version finds a model of a previous release
-* Renamed folder argument to remote_loc for downloader remote location, which caused confusion. Thanks @AtulSehgal
+* Fixed Vivekn Sentiment Analysis failing when training with a sentiment column

 ---------------
 Models
 ---------------
 * Symmetric Spell Checker pretrained model now works well and may be downloaded
 * Vivekn Sentiment pretrained model now defaults to "token" input column instead of "spell"

+---------------
+Other
+---------------
+* Downloader now works retroactively when a newer version finds a model of a previous release
+* Renamed folder argument to remote_loc for downloader remote location, which caused confusion. Thanks @AtulSehgal
+* Added new Scala example in example folder, also available on website
+
 ========
 1.5.2
 ========

docs/notebooks.html

Lines changed: 33 additions & 1 deletion

@@ -79,6 +79,34 @@ <h1 class="doc-title"><span aria-hidden="true" class="icon icon_genius"></span>
 <div id="showcase" class="doc-body">
 <div class="doc-content">
 <div class="content-inner">
+<section id="ScalaNotebook" class="doc-section">
+<h2 id="scala-theme-start" class="section-title" style="margin-bottom: 10px;" >Scala notebooks</h2>
+<p>
+In this section, we present different example use cases of both training and running
+predictions with Spark NLP in Scala. Please look up our
+<a href="http://nlp.johnsnowlabs.com/components.html">Annotators</a> page for reference.
+</p>
+<div>
+<h4 id="scala-vivekn-notebook" class="section-block"> Vivekn Sentiment Analysis</h4>
+<p>
+In the following example, we walk through Sentiment Analysis training and
+prediction using Spark NLP Annotators, Light Pipelines and Spark ML Pipelines.
+</p>
+<p>
+The ViveknSentimentApproach annotator applies Vivek Narayanan's algorithm, training either from
+a column in the training dataset whose rows are labelled 'positive' or 'negative', or from a folder
+of positive texts and a folder of negative texts. Using n-grams and negation of sequences,
+this statistical model can achieve high accuracy if trained properly.
+</p>
+<p>
+In this use case we train with Spark datasets passed to fit() and transform().
+Since we are dealing with small amounts of data, we put LightPipelines into practice.
+</p>
+<p>
+<a class="btn btn-warning btn-cta" style="float: center;margin-top: 10px;" href="https://github.com/JohnSnowLabs/spark-nlp/blob/1.5.3/example/src/TrainViveknSentiment.scala" target="_blank"> Take me to notebook!</a>
+</p>
+</div>
+</section>
 <section id="Notebook" class="doc-section">
 <h2 id="theme-start" class="section-title" style="margin-bottom: 10px;" >Python notebooks</h2>
 <p>

@@ -243,7 +271,11 @@ <h4 id="downloader-notebook" class="section-block"> Retrieving Pretrained models
 <ul id="doc-menu" class="nav doc-menu hidden-xs" data-spy="affix">

 <li>
-<a class="scrollto" href="#Notebook">Notebook</a>
+<a class="scrollto" href="#ScalaNotebook">Scala Notebooks</a>
+<ul class="nav doc-sub-menu">
+<li><a class="scrollto" href="#scala-vivekn-notebook">Vivekn Sentiment Analysis</a></li>
+</ul>
+<a class="scrollto" href="#Notebook">Python Notebooks</a>
 <ul class="nav doc-sub-menu">
 <li><a class="scrollto" href="#vivekn-notebook">Vivekn Sentiment Analysis</a></li>
 <li><a class="scrollto" href="#sentiment-notebook">Rule-based Sentiment Analysis</a></li>
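The "n-grams and negation of sequences" idea behind the Vivekn annotator can be sketched in plain Scala. This is a hedged illustration, not spark-nlp's implementation: the NegationSketch object, its word lists, and the negateSequence/bigrams helpers are hypothetical, modelled on Narayanan's description (the library does use a "not_" prefix internally, as seen in ViveknSentimentApproach in this commit).

```scala
// Hypothetical sketch of Vivekn-style negation handling: after a negation
// word, subsequent tokens are prefixed with "not_" until a punctuation-like
// boundary, so "good" and "not_good" become distinct features.
object NegationSketch {
  val negations = Set("not", "no", "never", "dont", "don't", "cannot")
  val boundaries = Set(".", ",", "!", "?", ";", ":")

  def negateSequence(tokens: Seq[String]): Seq[String] = {
    var negate = false
    tokens.map { t =>
      val lower = t.toLowerCase
      val out = if (negate && !boundaries.contains(lower)) "not_" + lower else lower
      if (negations.contains(lower)) negate = true
      else if (boundaries.contains(lower)) negate = false
      out
    }
  }

  // Simple bigram features over the (possibly negated) unigrams
  def bigrams(tokens: Seq[String]): Seq[String] =
    tokens.sliding(2).filter(_.size == 2).map(_.mkString(" ")).toSeq
}
```

On "I did not like it", the sketch keeps the negation word itself intact but flips everything after it, which is what lets a naive-Bayes-style counter treat negated context as separate vocabulary.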
example/src/TrainViveknSentiment.scala

Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+import com.johnsnowlabs.nlp.annotator._
+import com.johnsnowlabs.nlp.base._
+import com.johnsnowlabs.util.Benchmark
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.sql.SparkSession
+
+object TrainViveknSentiment extends App {
+
+  val spark: SparkSession = SparkSession
+    .builder()
+    .appName("test")
+    .master("local[*]")
+    .config("spark.driver.memory", "4G")
+    .config("spark.kryoserializer.buffer.max", "200M")
+    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+    .getOrCreate()
+
+  spark.sparkContext.setLogLevel("WARN")
+
+  import spark.implicits._
+
+  val training = Seq(
+    ("I really liked this movie!", "positive"),
+    ("The cast was horrible", "negative"),
+    ("Never going to watch this again or recommend it to anyone", "negative"),
+    ("It's a waste of time", "negative"),
+    ("I loved the protagonist", "positive"),
+    ("The music was really really good", "positive")
+  ).toDS.toDF("train_text", "train_sentiment")
+
+  val testing = Array(
+    "I don't recommend this movie, it's horrible",
+    "Dont waste your time!!!"
+  )
+
+  val document = new DocumentAssembler()
+    .setInputCol("train_text")
+    .setOutputCol("document")
+
+  val token = new Tokenizer()
+    .setInputCols("document")
+    .setOutputCol("token")
+
+  val normalizer = new Normalizer()
+    .setInputCols("token")
+    .setOutputCol("normal")
+
+  val vivekn = new ViveknSentimentApproach()
+    .setInputCols("document", "normal")
+    .setOutputCol("result_sentiment")
+    .setSentimentCol("train_sentiment")
+
+  val finisher = new Finisher()
+    .setInputCols("result_sentiment")
+    .setOutputCols("final_sentiment")
+
+  val pipeline = new Pipeline().setStages(Array(document, token, normalizer, vivekn, finisher))
+
+  val sparkPipeline = pipeline.fit(training)
+
+  val lightPipeline = new LightPipeline(sparkPipeline)
+
+  Benchmark.time("Light pipeline quick annotation") { lightPipeline.annotate(testing) }
+
+  Benchmark.time("Spark pipeline, this may be too much for just two rows!") {
+    val testingDS = testing.toSeq.toDS.toDF("testing_text")
+    println("Updating DocumentAssembler input column")
+    document.setInputCol("testing_text")
+    sparkPipeline.transform(testingDS).show()
+  }
+
+}
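Benchmark.time above is spark-nlp's own utility in com.johnsnowlabs.util; a minimal sketch of what such a helper could look like, without Spark (the TimeSketch name and the exact print format are assumptions, not the library's code):

```scala
// Hypothetical timing helper in the spirit of Benchmark.time: run a block,
// print the elapsed wall-clock time under a label, and return the result
// so it can be used inline without changing the surrounding code.
object TimeSketch {
  def time[T](label: String)(block: => T): T = {
    val start = System.nanoTime()
    val result = block          // evaluate the by-name block exactly once
    val elapsedMs = (System.nanoTime() - start) / 1e6
    println(f"$label: $elapsedMs%.2f ms")
    result
  }
}
```

Because the block is a by-name parameter, callers can time arbitrary expressions, e.g. `TimeSketch.time("annotate") { lightPipeline.annotate(testing) }`, exactly as the example does with Benchmark.time.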

src/main/scala/com/johnsnowlabs/nlp/AnnotatorApproach.scala

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ abstract class AnnotatorApproach[M <: Model[M]]

   /** requirement for pipeline transformation validation. It is called on fit() */
   override final def transformSchema(schema: StructType): StructType = {
-    require(validate(schema), s"Wrong annotators in pipeline. Make sure the following annotator types are present in inputCols: " +
+    require(validate(schema), s"Wrong annotators in pipeline for ${this.uid}. Make sure the following annotator types are present in inputCols: " +
       s"${requiredAnnotatorTypes.mkString(", ")}")
     getInputCols.foreach {
       annotationColumn =>
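The one-line change above only adds the failing stage's uid to the error message, so that in a pipeline with several annotators the exception names the stage that rejected the schema. The pattern, sketched with hypothetical names outside of Spark:

```scala
// Sketch of uid-tagged validation errors: a stage carries a unique id and
// includes it in its require() message, so a failure in a multi-stage
// pipeline immediately identifies which stage's inputCols are wrong.
class StageSketch(val uid: String, required: Seq[String]) {
  def validateColumns(present: Set[String]): Unit =
    require(required.forall(present.contains),
      s"Wrong annotators in pipeline for $uid. Make sure the following annotator types " +
      s"are present in inputCols: ${required.mkString(", ")}")
}
```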

src/main/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentApproach.scala

Lines changed: 2 additions & 0 deletions

@@ -71,6 +71,8 @@ class ViveknSentimentApproach(override val uid: String)
   import ResourceHelper.spark.implicits._
   val positiveDS = new MapAccumulator()
   val negativeDS = new MapAccumulator()
+  dataset.sparkSession.sparkContext.register(positiveDS)
+  dataset.sparkSession.sparkContext.register(negativeDS)
   val prefix = "not_"
   val tokenColumn = dataset.schema.fields
     .find(f => f.metadata.contains("annotatorType") && f.metadata.getString("annotatorType") == AnnotatorType.TOKEN)

src/main/scala/com/johnsnowlabs/util/spark/MapAccumulator.scala

Lines changed: 5 additions & 3 deletions

@@ -12,12 +12,14 @@ class MapAccumulator(defaultMap: MMap[String, Long] = MMap.empty[String, Long].w

   override def add(v: (String, Long)): Unit = mmap(v._1) += v._2

-  override def value: Map[String, Long] = mmap.toMap
+  override def value: Map[String, Long] = mmap.toMap.withDefaultValue(0)

-  override def copy(): AccumulatorV2[(String, Long), Map[String, Long]] = new MapAccumulator(MMap[String, Long](value.toSeq:_*))
+  override def copy(): AccumulatorV2[(String, Long), Map[String, Long]] =
+    new MapAccumulator(MMap[String, Long](value.toSeq:_*).withDefaultValue(0))

   override def isZero: Boolean = mmap.isEmpty

-  override def merge(other: AccumulatorV2[(String, Long), Map[String, Long]]): Unit = other.value.foreach{case (k, v) => mmap(k) += v}
+  override def merge(other: AccumulatorV2[(String, Long), Map[String, Long]]): Unit =
+    other.value.foreach{case (k, v) => mmap(k) += v}

 }
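The withDefaultValue(0) fix above matters because Scala's toMap does not carry a mutable map's default value over to the resulting immutable map: without the fix, looking up a word never seen during training would throw instead of returning a count of 0. A self-contained demonstration of that behavior (the DefaultValueSketch object is illustrative, not library code):

```scala
import scala.collection.mutable.{Map => MMap}

// Demonstrates why MapAccumulator.value needs .withDefaultValue(0):
// converting a mutable map with a default to an immutable Map via toMap
// silently drops the default, so lookups of absent keys throw.
object DefaultValueSketch {
  def demo(): (Boolean, Long) = {
    val counts: MMap[String, Long] = MMap.empty[String, Long].withDefaultValue(0L)
    counts("seen") += 3  // default of 0 lets += work on a fresh key

    val withoutDefault = counts.toMap                    // default NOT carried over
    val withDefault = counts.toMap.withDefaultValue(0L)  // the fix in this commit

    val throws = try { withoutDefault("unseen"); false }
                 catch { case _: NoSuchElementException => true }
    (throws, withDefault("unseen"))
  }
}
```

This is also why copy() re-applies withDefaultValue(0): the cloned accumulator must keep behaving like the original on unseen keys.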
