|
7 | 7 | import json |
8 | 8 | from src.model import GPT, GPTConfig |
9 | 9 | from src.trainer import Trainer, TrainerConfig |
10 | | -from torch.utils.data import Dataset |
| 10 | +from src.utils import Dataset |
11 | 11 | import torch |
12 | 12 | import numpy as np |
13 | 13 | torch.backends.cudnn.benchmark = True |
14 | 14 | torch.backends.cudnn.allow_tf32 = True |
15 | | -torch.backends.cuda.matmul.allow_tf32 = True |
| 15 | +torch.backends.cuda.matmul.allow_tf32 = True |
16 | 16 |
|
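Context, not part of this commit: these are standard PyTorch backend flags. cudnn.benchmark lets cuDNN autotune its kernel choice for fixed input shapes, and the two allow_tf32 flags permit TensorFloat-32 math on Ampere-class GPUs, trading a little float32 precision for speed. A quick sanity check that the flags took effect:

import torch
# All three flags are readable as well as writable.
print(torch.backends.cudnn.benchmark)          # True -> cuDNN autotunes kernels
print(torch.backends.cudnn.allow_tf32)         # True -> TF32 allowed in cuDNN ops
print(torch.backends.cuda.matmul.allow_tf32)   # True -> TF32 allowed in fp32 matmul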
17 | 17 | ### Step 1: set training data ########################################################################## |
18 | 18 |
|
|
36 | 36 | # If you see "CUDA out of memory", reduce it. Use GPU-Z to find the highest value for your VRAM. |
37 | 37 | batch_size = 12 |
38 | 38 |
|
39 | | -### Step 4: set learning rate, training 'epochs' ####################################################### |
| 39 | +### Step 4: set learning rate, training mini-epochs ####################################################### |
40 | 40 |
|
41 | 41 | lr_init = 6e-4 |
42 | 42 | lr_final = 1e-5 |
43 | | -# the 'epoch' here is very short and of fixed length (ctx_len * epoch_length_fixed tokens) |
| 43 | +# the mini-epoch is very short and of fixed length (ctx_len * epoch_length_fixed tokens) |
44 | 44 | n_epoch = 500 |
45 | | -# 0 = never, 1 = every 'epoch', 2 = every two 'epoch', etc. |
| 45 | +# 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, etc. |
46 | 46 | epoch_save_frequency = 30 |
47 | 47 | epoch_save_path = 'trained-' |
48 | 48 |
|
49 | 49 | epoch_length_fixed = 10000 |
50 | 50 |
|
51 | 51 | ######################################################################################################## |
52 | 52 |
|
53 | | - |
54 | 53 | # import src.utils |
55 | 54 | # src.utils.set_seed(42) # remember to change seed if you load a model |
56 | 55 |
|
|
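A rough budget check, not part of the commit: per the comment above, one mini-epoch is epoch_length_fixed samples of ctx_len tokens each. ctx_len is set in a part of train.py this diff does not show, so the value below is illustrative only:

ctx_len = 256                                # illustrative; the real value is set elsewhere in train.py
tokens_per_mini_epoch = ctx_len * 10000      # epoch_length_fixed
tokens_total = tokens_per_mini_epoch * 500   # n_epoch
# With these numbers: 2,560,000 tokens per mini-epoch, 1,280,000,000 total.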
71 | 70 | ######################################################################################################## |
72 | 71 |
|
73 | 72 | print('loading data... ' + datafile) |
74 | | - |
75 | | - |
76 | | -class Dataset(Dataset): |
77 | | - def __init__(self, data, ctx_len): |
78 | | - print('building token list...', end=' ') |
79 | | - unique = sorted(list(set(data))) |
80 | | - # print() |
81 | | - # for u in unique: |
82 | | - # print(u, end=' ') |
83 | | - # print('\n\n') |
84 | | - |
85 | | - xx = 0 |
86 | | - xxObj = {} |
87 | | - for u in unique: |
88 | | - xxObj[xx] = u |
89 | | - xx += 1 |
90 | | - with open('vocab.json', "w", encoding="utf-16") as vocab_file: |
91 | | - vocab_file.write(json.dumps(xxObj, ensure_ascii=False)) |
92 | | - |
93 | | - data_size, vocab_size = len(data), len(unique) |
94 | | - print('data has %d tokens, %d unique.' % (data_size, vocab_size)) |
95 | | - self.stoi = {ch: i for i, ch in enumerate(unique)} |
96 | | - self.itos = {i: ch for i, ch in enumerate(unique)} |
97 | | - self.ctx_len = ctx_len |
98 | | - self.vocab_size = vocab_size |
99 | | - self.data = data |
100 | | - |
101 | | - def __len__(self): |
102 | | - return epoch_length_fixed |
103 | | - |
104 | | - def __getitem__(self, idx): |
105 | | - # cheat: pick a random spot in dataset |
106 | | - i = np.random.randint(0, len(self.data) - (self.ctx_len + 1)) |
107 | | - chunk = self.data[i:i+self.ctx_len+1] |
108 | | - dix = [self.stoi[s] for s in chunk] |
109 | | - x = torch.tensor(dix[:-1], dtype=torch.long, |
110 | | - device=torch.device('cuda')) |
111 | | - y = torch.tensor(dix[1:], dtype=torch.long, |
112 | | - device=torch.device('cuda')) |
113 | | - return x, y |
114 | | - |
115 | | - |
116 | | -train_dataset = Dataset( |
117 | | - open(datafile, "r", encoding=datafile_encoding).read(), ctx_len) |
| 73 | +train_dataset = Dataset(open( |
| 74 | + datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed) |
118 | 75 |
|
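For readers tracing the refactor: the Dataset class deleted above moves into src/utils.py, now taking epoch_length_fixed as a constructor argument instead of reading a module-level global. The new file's contents are not shown in this diff, so the following is only a sketch reconstructed from the removed lines:

# src/utils.py (sketch, reconstructed from the removed lines above)
import json
import numpy as np
import torch
from torch.utils.data import Dataset as TorchDataset


class Dataset(TorchDataset):
    def __init__(self, data, ctx_len, epoch_length_fixed):
        print('building token list...', end=' ')
        unique = sorted(list(set(data)))
        # persist the id -> char vocab so inference can reload it
        xxObj = {i: u for i, u in enumerate(unique)}
        with open('vocab.json', "w", encoding="utf-16") as vocab_file:
            vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
        data_size, vocab_size = len(data), len(unique)
        print('data has %d tokens, %d unique.' % (data_size, vocab_size))
        self.stoi = {ch: i for i, ch in enumerate(unique)}
        self.itos = {i: ch for i, ch in enumerate(unique)}
        self.ctx_len = ctx_len
        self.epoch_length_fixed = epoch_length_fixed
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        # fixed-length mini-epoch, decoupled from the size of the data
        return self.epoch_length_fixed

    def __getitem__(self, idx):
        # cheat: pick a random window rather than iterating sequentially
        i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))
        chunk = self.data[i:i + self.ctx_len + 1]
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long, device=torch.device('cuda'))
        y = torch.tensor(dix[1:], dtype=torch.long, device=torch.device('cuda'))
        return x, y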
119 | 76 | ######################################################################################################## |
120 | 77 | # Train model |
|