Add a sample for Onnx conversion (#5195)

wangyems · web-flow · commit 9244e683d85f · 2020-06-08T12:24:57.000-07:00
* initial checkin

* remove unused using directives

* temp

* resolve comments

* more comments

* update

* remove complicated sample
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ModelOperations/OnnxConversion.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ModelOperations/OnnxConversion.cs
@@ -0,0 +1,106 @@
+﻿using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Microsoft.ML;
+using Microsoft.ML.Data;
+
+namespace Samples.Dynamic.ModelOperations
+{
+    /// <summary>Sample: export a trained ML.NET pipeline to ONNX and score with both versions.</summary>
+    public static class OnnxConversion
+    {
+        private class ScoreValue
+        {
+            public float Score { get; set; }
+        }
+
+        // The ONNX model's Score column is read as a vector; PrintScore shows its first element.
+        private class OnnxScoreValue
+        {
+            public VBuffer<float> Score { get; set; }
+        }
+
+        private static void PrintScore(IEnumerable<ScoreValue> values, int numRows)
+        {
+            foreach (var value in values.Take(numRows))
+                Console.WriteLine("{0, -10} {1, -10}", "Score", value.Score);
+        }
+
+        private static void PrintScore(IEnumerable<OnnxScoreValue> values, int numRows)
+        {
+            foreach (var value in values.Take(numRows))
+                Console.WriteLine("{0, -10} {1, -10}", "Score", value.Score.GetItemOrDefault(0));
+        }
+
+        /// <summary>Trains a classifier, converts it to ONNX, and prints scores from both models.</summary>
+        public static void Example()
+        {
+            var mlContext = new MLContext(seed: 0);
+
+            // Load the raw (unfeaturized) Adult dataset.
+            var originalData = Microsoft.ML.SamplesUtils.DatasetUtils
+                .LoadRawAdultDataset(mlContext);
+
+            // Partition the original dataset. Leave out 30% of data for testing.
+            var trainTestOriginalData = mlContext.Data
+                .TrainTestSplit(originalData, testFraction: 0.3);
+
+            // Define the training pipeline (wholePipeline = featurization steps + binary classification trainer)
+            var wholePipeline = mlContext.Transforms.CopyColumns("Label", "IsOver50K")
+                        // Convert categorical features to one-hot vectors
+                        .Append(mlContext.Transforms.Categorical.OneHotEncoding("workclass"))
+                        .Append(mlContext.Transforms.Categorical.OneHotEncoding("education"))
+                        .Append(mlContext.Transforms.Categorical.OneHotEncoding("marital-status"))
+                        .Append(mlContext.Transforms.Categorical.OneHotEncoding("occupation"))
+                        .Append(mlContext.Transforms.Categorical.OneHotEncoding("relationship"))
+                        .Append(mlContext.Transforms.Categorical.OneHotEncoding("ethnicity"))
+                        .Append(mlContext.Transforms.Categorical.OneHotEncoding("native-country"))
+                        // Combine all features into one feature vector
+                        .Append(mlContext.Transforms.Concatenate("Features", "workclass", "education", "marital-status",
+                        "occupation", "relationship", "ethnicity", "native-country", "age", "education-num",
+                        "capital-gain", "capital-loss", "hours-per-week"))
+                        // Min-max normalize all the features
+                        .Append(mlContext.Transforms.NormalizeMinMax("Features"))
+                        .Append(mlContext.BinaryClassification.Trainers.AveragedPerceptron());
+
+            // Fit the pipeline, and get a transformer that knows how to score new data
+            var transformer = wholePipeline.Fit(trainTestOriginalData.TrainSet);
+
+            // To convert an ML.NET model to an onnx model you need a transformer and input data.
+            // By default, the onnx conversion will generate the onnx file with the latest OpSet version.
+            var onnxModelPath = "sample_onnx_conversion_1.onnx";
+            using (var stream = File.Create(onnxModelPath))
+                mlContext.Model.ConvertToOnnx(transformer, originalData, stream);
+
+            // However, you can also specify a custom OpSet version by using the following code.
+            // Currently, we support OpSet version 9 for most transformers, but there are certain transformers that require a higher OpSet version.
+            // Please refer to the following link for the most up-to-date information on which OpSet versions we support:
+            // https://github.com/dotnet/machinelearning/blob/master/src/Microsoft.ML.OnnxConverter/OnnxExportExtensions.cs
+            int customOpSetVersion = 9;
+            using (var stream = File.Create("sample_onnx_conversion_2.onnx"))
+                mlContext.Model.ConvertToOnnx(transformer, originalData, customOpSetVersion, stream);
+
+            // Create the pipeline from the onnx file written above (same path, so the file is actually found).
+            var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxModelPath);
+            var onnxTransformer = onnxEstimator.Fit(trainTestOriginalData.TrainSet);
+
+            // Score the test set with both the ML.NET model and the onnx model
+            var output = transformer.Transform(trainTestOriginalData.TestSet);
+            var onnxOutput = onnxTransformer.Transform(trainTestOriginalData.TestSet);
+
+            // Read the Score column of each output as strongly typed rows
+            var outScores = mlContext.Data.CreateEnumerable<ScoreValue>(output, reuseRowObject: false);
+            var onnxOutScores = mlContext.Data.CreateEnumerable<OnnxScoreValue>(onnxOutput, reuseRowObject: false);
+
+            // Print the first 5 scores from each model; both calls are expected to print the same values:
+            PrintScore(outScores, 5);
+            PrintScore(onnxOutScores, 5);
+            //Score - 0.09044361
+            //Score - 9.105377
+            //Score - 11.049
+            //Score - 3.061928
+            //Score - 6.375817
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
@@ -29,6 +29,7 @@
     <ProjectReference Include="..\..\..\src\Microsoft.ML.TimeSeries\Microsoft.ML.TimeSeries.csproj" />
     <ProjectReference Include="..\..\..\src\Microsoft.ML.DnnImageFeaturizer.ResNet18\Microsoft.ML.DnnImageFeaturizer.ResNet18.csproj" />
     <ProjectReference Include="..\..\..\src\Microsoft.ML.Transforms\Microsoft.ML.Transforms.csproj" />
+    <ProjectReference Include="..\..\..\src\Microsoft.ML.OnnxConverter\Microsoft.ML.OnnxConverter.csproj" />
 
     <NativeAssemblyReference Include="CpuMathNative" />
     <NativeAssemblyReference Include="FastTreeNative" />
diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
@@ -82,6 +82,38 @@ public static IDataView LoadHousingRegressionDataset(MLContext mlContext)
         /// For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
         /// </remarks>
         public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext)
+        {
+            // Build the featurization chain; it starts by exposing "IsOver50K" under the name "Label"
+            var featurizer = mlContext.Transforms.CopyColumns("Label", "IsOver50K")
+                // One-hot encode each categorical column in place
+                .Append(mlContext.Transforms.Categorical.OneHotEncoding("workclass"))
+                .Append(mlContext.Transforms.Categorical.OneHotEncoding("education"))
+                .Append(mlContext.Transforms.Categorical.OneHotEncoding("marital-status"))
+                .Append(mlContext.Transforms.Categorical.OneHotEncoding("occupation"))
+                .Append(mlContext.Transforms.Categorical.OneHotEncoding("relationship"))
+                .Append(mlContext.Transforms.Categorical.OneHotEncoding("ethnicity"))
+                .Append(mlContext.Transforms.Categorical.OneHotEncoding("native-country"))
+                // Concatenate the encoded and numeric columns into a single "Features" column
+                .Append(mlContext.Transforms.Concatenate("Features", "workclass", "education", "marital-status",
+                    "occupation", "relationship", "ethnicity", "native-country", "age", "education-num",
+                    "capital-gain", "capital-loss", "hours-per-week"))
+                // Apply min-max normalization to "Features"
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"));
+
+            var rawData = LoadRawAdultDataset(mlContext);
+            var featurizerModel = featurizer.Fit(rawData);
+            return featurizerModel.Transform(rawData);
+        }
+
+        /// <summary>
+        /// Loads the Adult UCI dataset without applying any featurization.
+        /// </summary>
+        /// <param name="mlContext"><see cref="MLContext"/> used for data loading and processing.</param>
+        /// <returns>Raw dataset.</returns>
+        /// <remarks>
+        /// For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
+        /// </remarks>
+        public static IDataView LoadRawAdultDataset(MLContext mlContext)
         {
             // Obtains the path to the file
             string dataFile = GetAdultDataset();
@@ -103,33 +135,14 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext)
                         new TextLoader.Column("capital-gain", DataKind.Single, 10),
                         new TextLoader.Column("capital-loss", DataKind.Single, 11),
                         new TextLoader.Column("hours-per-week", DataKind.Single, 12),
-                        new TextLoader.Column("native-country", DataKind.Single, 13),
+                        new TextLoader.Column("native-country", DataKind.String, 13),
                         new TextLoader.Column("IsOver50K", DataKind.Boolean, 14),
                     },
                 separatorChar: ',',
                 hasHeader: true
             );
 
-            // Create data featurizing pipeline
-            var pipeline = mlContext.Transforms.CopyColumns("Label", "IsOver50K")
-                // Convert categorical features to one-hot vectors
-                .Append(mlContext.Transforms.Categorical.OneHotEncoding("workclass"))
-                .Append(mlContext.Transforms.Categorical.OneHotEncoding("education"))
-                .Append(mlContext.Transforms.Categorical.OneHotEncoding("marital-status"))
-                .Append(mlContext.Transforms.Categorical.OneHotEncoding("occupation"))
-                .Append(mlContext.Transforms.Categorical.OneHotEncoding("relationship"))
-                .Append(mlContext.Transforms.Categorical.OneHotEncoding("ethnicity"))
-                .Append(mlContext.Transforms.Categorical.OneHotEncoding("native-country"))
-                // Combine all features into one feature vector
-                .Append(mlContext.Transforms.Concatenate("Features", "workclass", "education", "marital-status",
-                    "occupation", "relationship", "ethnicity", "native-country", "age", "education-num",
-                    "capital-gain", "capital-loss", "hours-per-week"))
-                // Min-max normalize all the features
-                .Append(mlContext.Transforms.NormalizeMinMax("Features"));
-
-            var data = loader.Load(dataFile);
-            var featurizedData = pipeline.Fit(data).Transform(data);
-            return featurizedData;
+            return loader.Load(dataFile);
         }
 
         /// <summary>