-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathhyperparams.yml
More file actions
63 lines (51 loc) · 3.22 KB
/
hyperparams.yml
File metadata and controls
63 lines (51 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Where to find data
# Path expression to the tf.Example datafiles; wildcards may be used to match several files.
data_path: ""
# Path expression to the text vocabulary file.
vocab_path: ""
# Important settings
# Run mode; must be one of ['train', 'eval', 'decode'].
mode: "train"
# Decode mode only.
#   true  - evaluate the full dataset against a fixed checkpoint: take the current
#           checkpoint, produce one summary per example in the dataset, write the
#           summaries to file, then compute ROUGE scores for the whole dataset.
#   false - (default) concurrent decoding: repeatedly load the latest checkpoint,
#           summarize randomly-chosen examples and log the results to screen,
#           indefinitely.
single_pass: false
# Where to save output
# Root directory for all logging.
log_root: ""
# Experiment name; logs are saved in a directory with this name, under log_root.
experiment_name: ""
# Hyperparameters
# Dimension of the RNN hidden states.
hidden_dim: 256
# Dimension of the word embeddings.
emb_dim: 128
# Minibatch size.
batch_size: 16
# Max timesteps of the encoder (max source text tokens).
max_enc_steps: 400
# Max timesteps of the decoder (max summary tokens).
max_dec_steps: 100
# Beam size for beam search decoding.
beam_size: 4
# Minimum sequence length of a generated summary. Applies only in beam search decoding mode.
min_dec_steps: 35
# Size of vocabulary. Words are read from the vocabulary file in order. If the file
# contains fewer words than this number, or if this number is set to 0, all words in
# the vocabulary file are used.
vocab_size: 50000
# Learning rate. An alternative value is kept below, disabled:
# learning_rate: 0.15
learning_rate: 0.1
# Initial accumulator value for Adagrad.
adagrad_init_acc: 0.1
# Magnitude for the LSTM cells' random uniform initialization.
rand_unif_init_mag: 0.02
# Std of the truncated-normal init, used for initializing everything else.
# NOTE(review): this was 10000, which is implausible for an initializer standard
# deviation (weights would start with magnitudes around 1e4) and looks like an
# inverted exponent. Set to 1e-4, presumably the intended value - confirm against
# the training code. Written as a plain decimal because YAML 1.1 loaders such as
# PyYAML read a bare `1e-4` as a string rather than a float.
trunc_norm_init_std: 0.0001
# Maximum gradient norm, used for gradient clipping.
max_grad_norm: 2.0
# Select pointer-generator or baseline model
#   true  - use the pointer-generator model
#   false - use the baseline model
pointer_gen: true
# Coverage hyperparameters
# Enables the coverage mechanism. The experiments reported in the ACL paper train
# WITHOUT coverage until converged, then train for a short phase WITH coverage.
# To reproduce those results, keep this off for most of training and turn it on
# only for a short phase at the end.
coverage: false
# Weight of the coverage loss (lambda in the paper). If zero, there is no incentive
# to minimize coverage loss.
cov_loss_wt: 1.0
# Utility flags for restoring and changing checkpoints
# Converts a non-coverage model to a coverage model. Turn this on and run in train
# mode: the current training model is copied to a new version (same name with
# _cov_init appended) that is ready to run with the coverage flag turned on, for
# the coverage training stage.
convert_to_coverage_model: false
# Restores the best model from the eval/ dir and saves it in the train/ dir, ready
# for further training. Useful for early stopping, or if the training checkpoint
# has become corrupted with e.g. NaN values.
restore_best_model: false
# Debugging. See https://www.tensorflow.org/programmers_guide/debugger
# When true, run in tensorflow's debug mode (watches for NaN/inf values).
debug: false