Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions RELEASENOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@

Releases, starting with 9/2/2021, are listed with the most recent release at the top.

# NuGet Version 0.102.7

__Breaking Changes__:

A new interface `IDataset<out T>` has been added. (Now `Dataset<T>` implements `IDataset<T>`; `Dataset` implements both `IDataset<Dictionary<string, Tensor>>` and `IDataset<IReadOnlyDictionary<string, Tensor>>`; `IterableDataset` implements `IDataset<IList<Tensor>>` and `IDataset<IEnumerable<Tensor>>`.)<br/>
`torch.utils.data.ConcatDataset` has been added.<br/>

__API Changes__:

The `dataset` parameter of `DataLoader`s has been relaxed to `IDataset`.<br/>
The parameter of `DataLoader`s' collate functions has been relaxed to `IReadOnlyList`.<br/>

# NuGet Version 0.102.6

__Breaking Changes__:
Expand Down
108 changes: 108 additions & 0 deletions src/TorchSharp/ConcatDataset.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Linq;
using TorchSharp.Modules;

namespace TorchSharp
{
public static partial class torch
{
public static partial class utils
{
public static partial class data
{
/// <summary>
/// Creates a dataset that presents the given datasets, in order, as a single
/// concatenated dataset.
/// </summary>
/// <param name="datasets">The datasets to concatenate.</param>
/// <returns>A <see cref="ConcatDataset{T}"/> wrapping the given datasets.</returns>
public static ConcatDataset<T> ConcatDataset<T>(IEnumerable<IDataset<T>> datasets)
    => new ConcatDataset<T>(datasets);
}
}
}

namespace Modules
{
/// <summary>
/// A dataset that is the concatenation of several datasets, mirroring
/// PyTorch's torch.utils.data.ConcatDataset. Indexing maps a global index
/// to the owning dataset via a cumulative-size table.
/// </summary>
public class ConcatDataset<T> : torch.utils.data.Dataset<T>
{
    /// <summary>
    /// Yields the running total of dataset sizes: element i is the combined
    /// count of datasets 0..i inclusive.
    /// </summary>
    private static IEnumerable<long> Cumsum(
        IEnumerable<torch.utils.data.IDataset<T>> datasets)
    {
        var s = 0L;
        foreach (var e in datasets) {
            s += e.Count;
            yield return s;
        }
    }

    /// <summary>
    /// Index of the first element of <paramref name="a"/> strictly greater than
    /// <paramref name="x"/> (equivalent to Python's bisect.bisect_right).
    /// Assumes <paramref name="a"/> is sorted ascending.
    /// </summary>
    private static long bisectRight(long[] a, long x)
    {
        var lo = 0;
        var hi = a.Length;
        while (lo < hi) {
            var mid = (lo + hi) / 2;
            if (x < a[mid])
                hi = mid;
            else
                lo = mid + 1;
        }
        return lo;
    }

    private readonly torch.utils.data.IDataset<T>[] _datasets;
    public IReadOnlyList<torch.utils.data.IDataset<T>> datasets => _datasets;

    // _cumulativeSizes[i] is the total sample count of datasets 0..i; the last
    // entry is therefore the total Count of the concatenation.
    private readonly long[] _cumulativeSizes;
    public IReadOnlyList<long> cumulative_sizes => _cumulativeSizes;

    // When true, disposing this dataset also disposes the wrapped datasets.
    private readonly bool autoDispose;

    /// <summary>
    /// Constructs the concatenation of the given datasets.
    /// </summary>
    /// <param name="datasets">The datasets to concatenate. Must be non-empty.</param>
    /// <param name="autoDispose">
    /// Indicates whether the wrapped datasets are disposed when this instance is disposed.
    /// </param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="datasets"/> is empty.</exception>
    public ConcatDataset(
        IEnumerable<torch.utils.data.IDataset<T>> datasets,
        bool autoDispose = true)
    {
        this._datasets = datasets.ToArray();
        if (this._datasets.Length is 0)
            throw new ArgumentException(
                "datasets should not be an empty iterable", nameof(datasets));

        // PyTorch also says 'ConcatDataset does not support IterableDataset'.
        // But it's not our torch.utils.data.IterableDataset in TorchSharp.

        // Compute the cumulative sizes from the materialized array, not from the
        // incoming enumerable: re-enumerating 'datasets' a second time could
        // observe an exhausted or differently-produced sequence for one-shot
        // or lazily generated enumerables, corrupting the index table.
        this._cumulativeSizes = Cumsum(this._datasets).ToArray();

        this.autoDispose = autoDispose;
    }

    public override long Count => this._cumulativeSizes[this._cumulativeSizes.Length - 1];

    /// <summary>
    /// Gets the sample at a global index. Negative indices count from the end,
    /// as in PyTorch.
    /// </summary>
    /// <param name="index">Global index into the concatenated dataset.</param>
    /// <returns>The sample from the dataset that owns the index.</returns>
    /// <exception cref="ArgumentException">
    /// Thrown when a negative index's absolute value exceeds the total count.
    /// </exception>
    public override T GetTensor(long index)
    {
        if (index < 0) {
            if (-index > this.Count) {
                throw new ArgumentException(
                    "absolute value of index should not exceed dataset length",
                    nameof(index));
            }
            index = this.Count + index;
        }

        // Locate the owning dataset, then translate to a local index within it.
        var datasetIdx = bisectRight(this._cumulativeSizes, index);
        long sampleIdx;
        if (datasetIdx == 0)
            sampleIdx = index;
        else
            sampleIdx = index - this._cumulativeSizes[datasetIdx - 1];
        return this._datasets[datasetIdx][sampleIdx];
    }

    protected override void Dispose(bool disposing)
    {
        if (disposing && autoDispose) {
            foreach (var dataset in this._datasets)
                dataset.Dispose();
        }

        base.Dispose(disposing);
    }
}
}
}
52 changes: 30 additions & 22 deletions src/TorchSharp/DataLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@ public static partial class utils
{
public static partial class data
{

public static Modules.DataLoader DataLoader(
Dataset dataset,
IDataset<IReadOnlyDictionary<string, torch.Tensor>> dataset,
int batchSize, IEnumerable<long> shuffler,
Device device = null,
int num_worker = 1, bool drop_last = false,
Expand All @@ -34,7 +33,7 @@ public static Modules.DataLoader DataLoader(
}

public static Modules.DataLoader DataLoader(
Dataset dataset,
IDataset<IReadOnlyDictionary<string, torch.Tensor>> dataset,
int batchSize, bool shuffle = false,
Device device = null, int? seed = null,
int num_worker = 1, bool drop_last = false,
Expand All @@ -49,7 +48,7 @@ public static Modules.DataLoader DataLoader(
}

public static Modules.IterableDataLoader DataLoader(
IterableDataset dataset,
IDataset<IEnumerable<Tensor>> dataset,
int batchSize, IEnumerable<long> shuffler,
Device device = null,
int num_worker = 1, bool drop_last = false,
Expand All @@ -64,7 +63,7 @@ public static Modules.IterableDataLoader DataLoader(
}

public static Modules.IterableDataLoader DataLoader(
IterableDataset dataset,
IDataset<IEnumerable<Tensor>> dataset,
int batchSize, bool shuffle = false,
Device device = null, int? seed = null,
int num_worker = 1, bool drop_last = false,
Expand All @@ -90,7 +89,8 @@ namespace Modules
/// Data loader. Combines a dataset and a sampler, and provides an enumerator over the given dataset.
/// </summary>
/// <remarks>This class is used for map-style data sets</remarks>
public class DataLoader : DataLoader<Dictionary<string, torch.Tensor>, Dictionary<string, torch.Tensor>>
public class DataLoader : DataLoader<IReadOnlyDictionary<string, torch.Tensor>,
Dictionary<string, torch.Tensor>>
{
/// <summary>
/// Pytorch style dataloader
Expand All @@ -111,7 +111,7 @@ public class DataLoader : DataLoader<Dictionary<string, torch.Tensor>, Dictionar
/// Indicates whether to dispose the dataset when being disposed.
/// </param>
public DataLoader(
Dataset dataset,
IDataset<IReadOnlyDictionary<string, torch.Tensor>> dataset,
int batchSize, IEnumerable<long> shuffler,
Device device = null,
int num_worker = 1, bool drop_last = false,
Expand Down Expand Up @@ -144,7 +144,7 @@ public DataLoader(
/// Indicates whether to dispose the dataset when being disposed.
/// </param>
public DataLoader(
Dataset dataset,
IDataset<IReadOnlyDictionary<string, torch.Tensor>> dataset,
int batchSize, bool shuffle = false,
Device device = null, int? seed = null,
int num_worker = 1, bool drop_last = false,
Expand All @@ -157,7 +157,8 @@ public DataLoader(
{
}

private static Dictionary<string, torch.Tensor> Collate(IEnumerable<Dictionary<string, torch.Tensor>> dic, torch.Device device)
private static Dictionary<string, torch.Tensor> Collate(
IEnumerable<IReadOnlyDictionary<string, torch.Tensor>> dic, torch.Device device)
{
using (torch.NewDisposeScope()) {
Dictionary<string, torch.Tensor> batch = new();
Expand All @@ -176,7 +177,8 @@ public DataLoader(
/// Data loader. Combines a dataset and a sampler, and provides an enumerator over the given dataset.
/// </summary>
/// <remarks>This class is used for list-style data sets</remarks>
public class IterableDataLoader : DataLoader<IList<torch.Tensor>, IList<torch.Tensor>>
public class IterableDataLoader :
DataLoader<IEnumerable<torch.Tensor>, IList<torch.Tensor>>
{
/// <summary>
/// Pytorch style dataloader
Expand All @@ -197,7 +199,7 @@ public class IterableDataLoader : DataLoader<IList<torch.Tensor>, IList<torch.Te
/// Indicates whether to dispose the dataset when being disposed.
/// </param>
public IterableDataLoader(
IterableDataset dataset,
IDataset<IEnumerable<Tensor>> dataset,
int batchSize, IEnumerable<long> shuffler,
Device device = null,
int num_worker = 1, bool drop_last = false,
Expand Down Expand Up @@ -230,7 +232,7 @@ public IterableDataLoader(
/// Indicates whether to dispose the dataset when being disposed.
/// </param>
public IterableDataLoader(
IterableDataset dataset,
IDataset<IEnumerable<Tensor>> dataset,
int batchSize, bool shuffle = false,
Device device = null, int? seed = null,
int num_worker = 1, bool drop_last = false,
Expand All @@ -243,12 +245,18 @@ public IterableDataLoader(
{
}

private static IList<torch.Tensor> Collate(IEnumerable<IList<torch.Tensor>> dic, torch.Device device)
private static IList<torch.Tensor> Collate(
IReadOnlyList<IEnumerable<torch.Tensor>> dic, torch.Device device)
{
var dicCopy = new List<torch.Tensor[]>();
foreach (var e in dic) {
dicCopy.Add(e.ToArray());
}

using (torch.NewDisposeScope()) {
List<torch.Tensor> batch = new();
for (var x = 0; x < dic.First().Count; x++) {
var t = cat(dic.Select(k => k[x].unsqueeze(0)).ToArray(), 0);
for (var x = 0; x < dicCopy[0].Length; x++) {
var t = cat(dicCopy.Select(k => k[x].unsqueeze(0)).ToArray(), 0);
if (t.device_type != device.type || t.device_index != device.index)
t = t.to(device);
batch.Add(t.MoveToOuterDisposeScope());
Expand All @@ -264,12 +272,12 @@ public IterableDataLoader(
/// </summary>
public class DataLoader<T, S> : IEnumerable<S>, IDisposable
{
public Dataset<T> dataset { get; }
public IDataset<T> dataset { get; }
public int batch_size { get; }
public bool drop_last { get; }
public IEnumerable<long> sampler { get; }
public int num_workers { get; }
public Func<IEnumerable<T>, Device, S> collate_fn { get; }
public Func<IReadOnlyList<T>, Device, S> collate_fn { get; }

public Device Device { get; }
public bool DisposeBatch { get; }
Expand All @@ -295,9 +303,9 @@ public class DataLoader<T, S> : IEnumerable<S>, IDisposable
/// Indicates whether to dispose the dataset when being disposed.
/// </param>
public DataLoader(
Dataset<T> dataset,
IDataset<T> dataset,
int batchSize,
Func<IEnumerable<T>, torch.Device, S> collate_fn,
Func<IReadOnlyList<T>, torch.Device, S> collate_fn,
IEnumerable<long> shuffler,
Device? device = null,
int num_worker = 1,
Expand Down Expand Up @@ -337,9 +345,9 @@ public DataLoader(
/// Indicates whether to dispose the dataset when being disposed.
/// </param>
public DataLoader(
Dataset<T> dataset,
IDataset<T> dataset,
int batchSize,
Func<IEnumerable<T>, torch.Device, S> collate_fn,
Func<IReadOnlyList<T>, torch.Device, S> collate_fn,
bool shuffle = false,
Device? device = null,
int? seed = null,
Expand Down Expand Up @@ -432,7 +440,7 @@ public bool MoveNext()
.WithDegreeOfParallelism(loader.num_workers)
.ForAll((i) => {
using var getTensorScope = torch.NewDisposeScope();
tensors[i] = loader.dataset.GetTensor(indices[i]);
tensors[i] = loader.dataset[indices[i]];
getTensorDisposables[i] = getTensorScope.DetachAllAndDispose();
});

Expand Down
44 changes: 41 additions & 3 deletions src/TorchSharp/Dataset.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;

namespace TorchSharp
{
Expand All @@ -13,21 +14,29 @@ public static partial class data
/// <summary>
/// Map-style data set
/// </summary>
public abstract class Dataset : Dataset<Dictionary<string, torch.Tensor>>
public abstract class Dataset : Dataset<Dictionary<string, Tensor>>,
IDataset<IReadOnlyDictionary<string, Tensor>>
Copy link
Contributor Author

@yueyinqiu yueyinqiu Jul 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As commented in the codes below, due to covariation, Dataset should naturally be IDataset<IReadOnlyDictionary<string, Tensor>> (because it is IDataset<Dictionary<string, Tensor>>). However FSharp.Examples cannot be complied without this. I don't know why this happens.

I would suggest to remove this line, because it influences the detection of torch.utils.data.ConcatDataset (IReadOnlyDictionary or Dictionary are both ok, so it can't automatically choose one), but I failed to make the fsharp program work.

Copy link
Contributor Author

@yueyinqiu yueyinqiu Jul 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A more radical idea is to remove Dataset and IterableDataset at all. Actually we don't need them. DataLoaders could work on any IDataset<IReadOnlyDictionary<string, torch.Tensor>> and IDataset<IEnumerable<Tensor>>.

By the way, our IterableDataset is occupying the position of PyTorch IterableDataset. #1353 That's also part of the reason why I suggest to remove them.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

F# samples have to work.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So. just to check -- with the current commits, the F# examples build, correct? I don't see any errors in the latest build.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I could be built by adding : IDataset<IReadOnlyDictionary<string, Tensor>>.

{
// Due to covariation, it should naturally be IDataset<IReadOnlyDictionary<string, Tensor>>.
// However FSharp.Examples will break down without this.
IReadOnlyDictionary<string, Tensor> IDataset<IReadOnlyDictionary<string, Tensor>>.this[long index] => this[index];
}

/// <summary>
/// Iterable-style data sets
/// </summary>
public abstract class IterableDataset : Dataset<IList<Tensor>>
public abstract class IterableDataset : Dataset<IList<Tensor>>,
IDataset<IEnumerable<Tensor>>
{
// Due to covariation, it should naturally be IDataset<IEnumerable<Tensor>>.
// However FSharp.Examples will break down without this.
IEnumerable<Tensor> IDataset<IEnumerable<Tensor>>.this[long index] => this[index];
}

/// <summary>
/// The base class for all Datasets.
/// </summary>
public abstract class Dataset<T> : IDisposable
public abstract class Dataset<T> : IDataset<T>, IDisposable
{
public void Dispose()
{
Expand All @@ -40,6 +49,12 @@ public void Dispose()
/// </summary>
public abstract long Count { get; }

[IndexerName("DatasetItems")]
public T this[long index] => this.GetTensor(index);

// GetTensor is kept for compatibility.
// Perhaps we should remove that and make the indexer abstract later.

/// <summary>
/// Get tensor according to index
/// </summary>
Expand All @@ -49,8 +64,31 @@ public void Dispose()

/// <summary>
/// Releases resources held by the dataset. The base implementation is a no-op;
/// derived datasets override this to dispose tensors or other resources.
/// </summary>
/// <param name="disposing">True when called from <see cref="Dispose()"/>, false from a finalizer.</param>
protected virtual void Dispose(bool disposing)
{
    // Removed leftover no-op locals (an IDataset covariance experiment that
    // had no runtime effect and was accidentally committed).
}
}

/// <summary>
/// The base interface for all Datasets.
/// </summary>
public interface IDataset<out T> : IDisposable
{
/// <summary>
/// Size of dataset
/// </summary>
long Count { get; }

/// <summary>
/// Get tensor according to index
/// </summary>
/// <param name="index">Index for tensor</param>
/// <returns>Tensors of index. DataLoader will catenate these tensors into batches.</returns>
[IndexerName("DatasetItems")]
T this[long index] { get; }
Copy link
Contributor Author

@yueyinqiu yueyinqiu Jul 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's using indexers instead of GetTensor in the new interface. I'm not really sure about its IndexerName.

And... shall we remove GetTensor at the same time? Currently Dataset<T> is keeping an abstract GetTensor to ensure compatibility. (And its indexer is calling GetTensor.)


// TODO: support System.Index
}
}
}
}
Expand Down
Loading