# Source: dltemplate/src/tf_model/pointer_generator/hyperparams.yml (markmo/dltemplate, master branch)

# Input locations

# Path expression for the tf.Example datafiles; wildcards may be used to match multiple datafiles.
data_path: ""
# Path expression for the text vocabulary file.
vocab_path: ""

# Important settings

# Run mode; one of ['train', 'eval', 'decode'].
mode: "train"

# Only relevant in decode mode.
#   true:  evaluate the full dataset against a fixed checkpoint, i.e. take the current
#          checkpoint, produce one summary per example in the dataset, write the
#          summaries to file, then compute ROUGE scores over the whole dataset.
#   false: (default) decode concurrently, i.e. keep reloading the latest checkpoint,
#          summarize randomly-chosen examples and log the results to screen, indefinitely.
single_pass: false

# Output locations

# Root directory for all logging.
log_root: ""
# Experiment name; logs are saved in a directory with this name, under log_root.
experiment_name: ""

# Hyperparameters
hidden_dim: 256  # dimension of RNN hidden states
emb_dim: 128  # dimension of word embeddings
batch_size: 16  # minibatch size
max_enc_steps: 400  # max timesteps of encoder (max source text tokens)
max_dec_steps: 100  # max timesteps of decoder (max summary tokens)
beam_size: 4  # beam size for beam search decoding
min_dec_steps: 35  # Minimum sequence length of generated summary. Applies only for beam search decoding mode.

# Size of vocabulary. These will be read from the vocabulary file in order. If the vocabulary file contains
# fewer words than this number, or if this number is set to 0, will take all words in the vocabulary file.
vocab_size: 50000
# Alternative learning rate, left disabled:
# learning_rate: 0.15
learning_rate: 0.1
adagrad_init_acc: 0.1  # initial accumulator value for Adagrad
rand_unif_init_mag: 0.02  # magnitude for LSTM cells' random uniform initialization
# Std of trunc norm init, used for initializing everything else.
# This was 10000, which as a standard deviation for weight initialization is orders of
# magnitude too large; the intended value is presumably 1e-4 (confirm against the
# reference pointer-generator defaults). It is written in plain decimal form because
# YAML 1.1 loaders may read exponent notation without a decimal point as a string.
trunc_norm_init_std: 0.0001
max_grad_norm: 2.0  # for gradient clipping

# Model selection: pointer-generator or baseline.
#   true:  use the pointer-generator model
#   false: use the baseline model
pointer_gen: true

# Coverage hyperparameters

# Enables the coverage mechanism. The experiments reported in the ACL paper were trained
# WITHOUT coverage until convergence, followed by a short phase WITH coverage. To reproduce
# those results, leave this off for most of training and switch it on only for a short
# phase at the end.
coverage: false
# Weight of the coverage loss (lambda in the paper). If zero, there is no incentive to
# minimize coverage loss.
cov_loss_wt: 1.0

# Utility flags for restoring and changing checkpoints

# Converts a non-coverage model into a coverage model. Turn this on and run in train mode:
# the current training model is copied to a new version (same name with _cov_init appended)
# that is ready to run with the coverage flag turned on, for the coverage training stage.
convert_to_coverage_model: false

# Restores the best model from the eval/ dir and saves it in the train/ dir, ready for
# further training. Useful for early stopping, or if the training checkpoint has become
# corrupted with e.g. NaN values.
restore_best_model: false

# Debugging. See https://www.tensorflow.org/programmers_guide/debugger
# Runs in tensorflow's debug mode (watches for NaN/inf values).
debug: false