// Source: TorchSharp/src/Examples/TextClassification.cs (svick/TorchSharp, main branch)
// Copyright (c) .NET Foundation and Contributors.  All Rights Reserved.  See LICENSE in the project root for license information.
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;
using System.Diagnostics;

using static TorchSharp.torch;
using static TorchSharp.torch.nn;

namespace TorchSharp.Examples
{

    /// <summary>
    /// This example is based on the PyTorch tutorial at:
    ///
    /// https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
    ///
    /// It relies on the AG_NEWS dataset, which can be downloaded in CSV form at:
    ///
    /// https://github.com/mhjabreel/CharCnn_Keras/tree/master/data/ag_news_csv
    ///
    /// Download the two files, and place them in a folder called "AG_NEWS" in
    /// accordance with the file path below (Windows only).
    ///
    /// </summary>
    public class TextClassification
    {
        // Embedding dimension handed to TextClassificationModel.
        private const long emsize = 200;

        // Batch sizes requested from the reader for training and evaluation.
        private const long batch_size = 128;
        private const long eval_batch_size = 128;

        // Number of passes over the training data.
        private const int epochs = 15;

        // This path assumes that you're running this on Windows.
#if NET472_OR_GREATER
        private readonly static string _dataLocation = NSPath.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "AG_NEWS");
#else
        private readonly static string _dataLocation = Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "AG_NEWS");
#endif // NET472_OR_GREATER

        /// <summary>
        /// Entry point of the example: builds a vocabulary from the "train" split,
        /// trains a <see cref="TextClassificationModel"/> for <c>epochs</c> epochs and
        /// finally prints the accuracy measured on the "test" split.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        internal static void Main(string[] args)
        {
            // Fixed seed so that repeated runs start from the same random state.
            torch.random.manual_seed(1);

            var device = torch.cuda.is_available() ? torch.CUDA : torch.CPU;
            Console.WriteLine($"Running TextClassification on {device.type.ToString()}");

            using (var reader = TorchText.Data.AG_NEWSReader.AG_NEWS("train", (Device)device, _dataLocation)) {

                var dataloader = reader.Enumerate();

                var tokenizer = TorchText.Data.Utils.get_tokenizer("basic_english");

                // Count every token of the training split; the vocabulary is built from these counts.
                var counter = new TorchText.Vocab.Counter<string>();
                foreach (var (label, text) in dataloader) {
                    counter.update(tokenizer(text));
                }

                var vocab = new TorchText.Vocab.Vocab(counter);

                // The last argument is the number of output classes of the classifier.
                var model = new TextClassificationModel(vocab.Count, emsize, 4).to((Device)device);

                var loss = CrossEntropyLoss();
                var lr = 5.0;
                var optimizer = torch.optim.SGD(model.parameters(), lr);
                var scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, 0.2, last_epoch: 5);

                // A single dispose scope spans all training epochs, so the tensors it
                // tracks are released together once training has finished. This data
                // set is small enough that we can get away with that.
                // NOTE(review): if memory becomes a concern, open the scope inside the
                // epoch loop instead - after confirming that nothing created in one
                // epoch is reused in the next.
                using (var d = torch.NewDisposeScope()) {

                    foreach (var epoch in Enumerable.Range(1, epochs)) {

                        var sw = Stopwatch.StartNew();

                        train(epoch, reader.GetBatches(tokenizer, vocab, batch_size), model, loss, optimizer);

                        sw.Stop();

                        var pgFirst = optimizer.ParamGroups.First();

                        Console.WriteLine($"\nEnd of epoch: {epoch} | lr: {pgFirst.LearningRate:0.00} | time: {sw.Elapsed.TotalSeconds:0.0}s\n");
                        scheduler.step();
                    }
                }

                using (var d = torch.NewDisposeScope()) {

                    using (var test_reader = TorchText.Data.AG_NEWSReader.AG_NEWS("test", (Device)device, _dataLocation)) {

                        var sw = Stopwatch.StartNew();

                        var accuracy = evaluate(test_reader.GetBatches(tokenizer, vocab, eval_batch_size), model, loss);

                        sw.Stop();

                        // Training is over at this point, so the learning-rate scheduler is not stepped again.
                        Console.WriteLine($"\nEnd of training: test accuracy: {accuracy:0.00} | eval time: {sw.Elapsed.TotalSeconds:0.0}s\n");
                    }
                }
            }
        }

        /// <summary>
        /// Runs one training epoch over <paramref name="train_data"/>, updating the model
        /// with <paramref name="optimizer"/> and printing the running accuracy every 250 batches.
        /// </summary>
        /// <param name="epoch">Epoch number; only used in the progress messages.</param>
        /// <param name="train_data">Batches of (labels, texts, offsets) tensors.</param>
        /// <param name="model">The model being trained; switched to training mode.</param>
        /// <param name="criterion">Loss applied to (predicted labels, labels).</param>
        /// <param name="optimizer">Optimizer that applies the gradient step.</param>
        static void train(int epoch, IEnumerable<(Tensor, Tensor, Tensor)> train_data, TextClassificationModel model, Loss<torch.Tensor, torch.Tensor, torch.Tensor> criterion, torch.optim.Optimizer optimizer)
        {
            model.train();

            double total_acc = 0.0;
            long total_count = 0;
            long log_interval = 250;

            var batch = 0;

            // The total batch count is needed for the progress message. Materialize the
            // sequence once instead of calling Count() and then iterating again, which
            // would produce every batch twice when the source is lazily evaluated.
            var batches = train_data as IReadOnlyCollection<(Tensor, Tensor, Tensor)> ?? train_data.ToList();
            var batch_count = batches.Count;

            foreach (var (labels, texts, offsets) in batches) {

                optimizer.zero_grad();

                using (var predicted_labels = model.call(texts, offsets))
                using (var loss = criterion.call(predicted_labels, labels)) {

                    loss.backward();
                    // Clip the gradient norm to 0.5 before the optimizer step.
                    torch.nn.utils.clip_grad_norm_(model.parameters().ToArray(), 0.5);
                    optimizer.step();

                    total_acc += (predicted_labels.argmax(1) == labels).sum().to(torch.CPU).item<long>();
                    total_count += labels.size(0);
                }

                // Report the accuracy accumulated since the start of the epoch; batch 0 is skipped.
                if (batch % log_interval == 0 && batch > 0) {
                    var accuracy = total_acc / total_count;
                    Console.WriteLine($"epoch: {epoch} | batch: {batch} / {batch_count} | accuracy: {accuracy:0.00}");
                }

                batch += 1;
            }
        }

        /// <summary>
        /// Measures the fraction of correctly classified samples in <paramref name="test_data"/>.
        /// </summary>
        /// <param name="test_data">Batches of (labels, texts, offsets) tensors.</param>
        /// <param name="model">The model to evaluate; switched to evaluation mode.</param>
        /// <param name="criterion">
        /// Not used by the accuracy computation; kept so that existing callers keep compiling.
        /// </param>
        /// <returns>
        /// Correct predictions divided by the number of samples seen. The result is
        /// <see cref="double.NaN"/> when <paramref name="test_data"/> yields no samples.
        /// </returns>
        static double evaluate(IEnumerable<(Tensor, Tensor, Tensor)> test_data, TextClassificationModel model, Loss<Tensor, Tensor, Tensor> criterion)
        {
            model.eval();

            double total_acc = 0.0;
            long total_count = 0;

            // No gradients are needed for evaluation, so disable autograd tracking.
            using (torch.no_grad()) {

                foreach (var (labels, texts, offsets) in test_data) {

                    using (var predicted_labels = model.call(texts, offsets)) {

                        total_acc += (predicted_labels.argmax(1) == labels).sum().to(torch.CPU).item<long>();
                        total_count += labels.size(0);
                    }
                }
            }

            return total_acc / total_count;
        }
    }

    /// <summary>
    /// Text classifier made of an EmbeddingBag layer followed by a single Linear layer.
    /// Use <see cref="call(Tensor, Tensor)"/> to run it; the single-tensor
    /// <see cref="forward(Tensor)"/> inherited from the base class is not supported.
    /// </summary>
    class TextClassificationModel : Module<Tensor, Tensor>
    {
        private Modules.EmbeddingBag embedding;
        private Modules.Linear fc;

        /// <summary>
        /// Creates the two layers, initializes their weights and registers them with the module.
        /// </summary>
        /// <param name="vocab_size">Number of rows of the embedding table.</param>
        /// <param name="embed_dim">Embedding width, which is also the input width of the linear layer.</param>
        /// <param name="num_class">Output width of the linear layer.</param>
        public TextClassificationModel(long vocab_size, long embed_dim, long num_class) : base("TextClassification")
        {
            embedding = EmbeddingBag(vocab_size, embed_dim, sparse: false);
            fc = Linear(embed_dim, num_class);
            InitWeights();

            RegisterComponents();
        }

        /// <summary>
        /// Fills both weight tensors uniformly from [-0.5, 0.5] and zeroes the linear bias.
        /// </summary>
        private void InitWeights()
        {
            var initrange = 0.5;

            init.uniform_(embedding.weight, -initrange, initrange);
            init.uniform_(fc.weight, -initrange, initrange);
            init.zeros_(fc.bias);
        }

        /// <summary>
        /// Not supported: this model needs both the token tensor and the offsets tensor.
        /// </summary>
        /// <exception cref="NotImplementedException">Always thrown.</exception>
        public override Tensor forward(Tensor t)
        {
            throw new NotImplementedException("TextClassificationModel requires two inputs; use call(input, offsets) instead.");
        }

        /// <summary>
        /// Runs the model: the EmbeddingBag output for (<paramref name="input"/>,
        /// <paramref name="offsets"/>) is fed through the linear layer.
        /// </summary>
        /// <param name="input">Tensor passed as the first argument to the embedding layer.</param>
        /// <param name="offsets">Tensor passed as the second argument to the embedding layer.</param>
        /// <returns>The output of the linear layer; the caller owns and disposes it.</returns>
        public Tensor call(Tensor input, Tensor offsets)
        {
            // Release the intermediate embedding result as soon as the linear layer has
            // consumed it, instead of relying on the caller's dispose scope.
            using (var embedded = embedding.call(input, offsets)) {
                return fc.call(embedded);
            }
        }

        /// <summary>
        /// Disposes both layers and clears the registered sub-modules before the base class cleans up.
        /// </summary>
        protected override void Dispose(bool disposing)
        {
            if (disposing) {
                embedding.Dispose();
                fc.Dispose();
                ClearModules();
            }
            base.Dispose(disposing);
        }
    }
}