Skip to content

Commit 2f125b9

Browse files
committed
add DecodePayload function
1 parent f548906 commit 2f125b9

6 files changed

Lines changed: 328 additions & 7 deletions

File tree

arc.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,33 @@ func (u *url1) Fields() map[string][]string {
9595
// IP returns the value held in the record's ip field.
func (u *url1) IP() string { return u.ip }
9696
// MIME returns the value held in the record's mime field.
func (u *url1) MIME() string { return u.mime }
9797

98+
func (u *url1) transferEncodings() []string {
99+
if len(u.fields) == 0 {
100+
return nil
101+
}
102+
vals := getSelectValues(u.fields, "Transfer-Encoding")
103+
if vals[0] == "" {
104+
return nil
105+
}
106+
return splitAndReverse(vals[0])
107+
}
108+
func (u *url1) encodings() []string {
109+
if len(u.fields) == 0 {
110+
return nil
111+
}
112+
vals := getSelectValues(u.fields, "Transfer-Encoding", "Content-Encoding")
113+
if vals[0] == "" {
114+
if vals[1] == "" {
115+
return nil
116+
}
117+
return splitAndReverse(vals[1])
118+
}
119+
if vals[1] == "" {
120+
return splitAndReverse(vals[0])
121+
}
122+
return append(splitAndReverse(vals[0]), splitAndReverse(vals[1])...)
123+
}
124+
98125
// size returns the value held in the record's sz field.
func (u *url1) size() int64 { return u.sz }
99126
// setfields stores f as the record's fields buffer; the slice is kept as
// given, not copied.
func (u *url1) setfields(f []byte) { u.fields = f }
100127

decode.go

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
// Copyright 2015 Richard Lehane. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Transfer-Encoding: the form of encoding used to safely transfer the entity to the user. Currently defined methods are: chunked, compress, deflate, gzip, identity.
16+
17+
/*
18+
The official list of tokens available to servers and client is maintained by IANA,[4] and it includes:
19+
20+
compress – UNIX "compress" program method (historic; deprecated in most applications and replaced by gzip or deflate)
21+
*deflate – compression based on the deflate algorithm (described in RFC 1951), wrapped inside the zlib data format (RFC 1950);
22+
exi – W3C Efficient XML Interchange
23+
*gzip – GNU zip format (described in RFC 1952). This method is the most broadly supported as of March 2011.[5]
24+
*identity – No transformation is used. This is the default value for content coding.
25+
pack200-gzip – Network Transfer Format for Java Archives[6]
26+
27+
*/
28+
package webarchive
29+
30+
import (
31+
"compress/gzip"
32+
"compress/zlib"
33+
"io"
34+
"net/http/httputil"
35+
)
36+
37+
// isgzip reports whether buf starts with a gzip member header: the magic
// bytes 0x1f 0x8b followed by compression method 8 (deflate).
//
// A buffer shorter than three bytes cannot hold the header, so it yields
// false instead of an index-out-of-range panic.
func isgzip(buf []byte) bool {
	return len(buf) >= 3 && buf[0] == 0x1f && buf[1] == 0x8b && buf[2] == 8
}
43+
44+
// zlibDeflate is the zlib compression-method code (low nibble of the first
// header byte) that denotes deflate.
const zlibDeflate = 8

// iszlib reports whether buf starts with a plausible two-byte zlib header:
// the low nibble of the first byte must be zlibDeflate and the two bytes,
// read as a big-endian 16-bit number, must be a multiple of 31.
//
// A buffer shorter than two bytes yields false instead of panicking.
func iszlib(buf []byte) bool {
	if len(buf) < 2 {
		return false
	}
	if buf[0]&0x0f != zlibDeflate {
		return false
	}
	header := uint(buf[0])<<8 | uint(buf[1])
	return header%31 == 0
}
53+
54+
// ischunk reports whether buf looks like the start of a chunked
// transfer-encoded body: one or more hexadecimal digits (the chunk size)
// immediately followed by CRLF.
//
// It returns false when a non-hex byte precedes the CR, when the CR is the
// first byte, when the CR is not followed by LF inside buf, or when buf ends
// before any CR is seen. Chunk extensions (";name=value" after the size) are
// not recognised.
func ischunk(buf []byte) bool {
	for i, c := range buf {
		switch {
		case '0' <= c && c <= '9', 'a' <= c && c <= 'f', 'A' <= c && c <= 'F':
			// still inside the hexadecimal chunk size
			continue
		case c == '\r':
			// need at least one size digit before, and LF right after
			return i > 0 && i+1 < len(buf) && buf[i+1] == '\n'
		default:
			return false
		}
	}
	return false
}
74+
75+
type payloadDecoder struct {
76+
Record
77+
rdr io.Reader
78+
}
79+
80+
func (pd *payloadDecoder) Read(b []byte) (int, error) {
81+
return pd.rdr.Read(b)
82+
}
83+
84+
func (pd *payloadDecoder) IsSlicer() bool {
85+
return false
86+
}
87+
88+
func newDecoder(rec Record, encodings []string) Record {
89+
if len(encodings) == 0 {
90+
return rec
91+
}
92+
pd := &payloadDecoder{Record: rec, rdr: rec}
93+
for i, v := range encodings {
94+
switch v {
95+
case "chunked":
96+
if i == 0 {
97+
if peek, err := rec.peek(10); err != nil || !ischunk(peek) {
98+
return rec
99+
}
100+
}
101+
pd.rdr = httputil.NewChunkedReader(pd.rdr)
102+
case "deflate":
103+
if i == 0 {
104+
if peek, err := rec.peek(2); err != nil || !iszlib(peek) {
105+
return rec
106+
}
107+
}
108+
rdr, err := zlib.NewReader(pd.rdr)
109+
if err == nil {
110+
pd.rdr = rdr
111+
}
112+
case "gzip":
113+
if i == 0 {
114+
if peek, err := rec.peek(3); err != nil || !isgzip(peek) {
115+
return rec
116+
}
117+
}
118+
rdr, err := gzip.NewReader(pd.rdr)
119+
if err == nil {
120+
pd.rdr = rdr
121+
}
122+
}
123+
}
124+
return pd
125+
}
126+
127+
// DecodePayload decodes any transfer or content encodings declared in the HTTP headers
128+
// of a record. Decodes chunked, deflate and gzip encodings.
129+
func DecodePayload(r Record) Record {
130+
return newDecoder(r, r.encodings())
131+
}
132+
133+
// DecodePayload decodes any transfer or content encodings declared in the HTTP headers
134+
// of a record. Decodes chunked, deflate and gzip encodings.
135+
func TransferDecodePayload(r Record) Record {
136+
return newDecoder(r, r.transferEncodings())
137+
}

reader.go

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ func (r *reader) Read(p []byte) (int, error) {
6868
return l, err
6969
}
7070

71+
func (r *reader) IsSlicer() bool {
72+
return r.slicer
73+
}
74+
7175
// Slice returns a byte slice with size l from a given offset from the start of the content of the record.
7276
// When iterating with NextPayload, the slice zero offset starts after any stripped HTTP headers. Otherwise,
7377
// the zero offset is immediately after the WARC or ARC header block.
@@ -146,13 +150,6 @@ func (r *reader) reset(s io.Reader) error {
146150
return r.unzip()
147151
}
148152

149-
func isgzip(buf []byte) bool {
150-
if buf[0] != 0x1f || buf[1] != 0x8b || buf[2] != 8 {
151-
return false
152-
}
153-
return true
154-
}
155-
156153
func (r *reader) unzip() error {
157154
if buf, err := r.srcpeek(3); err == nil && isgzip(buf) {
158155
var rdr io.Reader = r.sbuf
@@ -396,6 +393,15 @@ func normaliseKey(k []byte) string {
396393
return s
397394
}
398395

396+
// splitAndReverse splits the comma-separated list s, trims surrounding white
// space from each element and returns the elements in reverse order, so the
// last-listed element comes first.
//
// Empty elements (as in "gzip,,chunked" or a trailing comma) are dropped, as
// HTTP list parsing requires; otherwise they would surface as "" entries. An
// input with no non-empty elements yields an empty slice.
func splitAndReverse(s string) []string {
	parts := strings.Split(s, ",")
	ret := make([]string, 0, len(parts))
	for i := len(parts) - 1; i >= 0; i-- {
		if v := strings.TrimSpace(parts[i]); v != "" {
			ret = append(ret, v)
		}
	}
	return ret
}
404+
399405
func getSelectValues(buf []byte, vals ...string) []string {
400406
ret := make([]string, len(vals))
401407
lines := getLines(buf)
@@ -547,6 +553,10 @@ func (c *continuation) Read(p []byte) (int, error) {
547553
return l, err
548554
}
549555

556+
func (c *continuation) IsSlicer() bool {
557+
return true
558+
}
559+
550560
func (c *continuation) Slice(off int64, l int) ([]byte, error) {
551561
if c.start+int(off) >= len(c.buf) {
552562
return nil, io.EOF
@@ -571,3 +581,7 @@ func (c *continuation) EofSlice(off int64, l int) ([]byte, error) {
571581
}
572582
return c.buf[o:l], err
573583
}
584+
585+
func (c *continuation) peek(i int) ([]byte, error) {
586+
return c.Slice(0, i)
587+
}

utils/warcscan/warcscan.go

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
// Copyright 2015 Richard Lehane. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Warcscan is a simple script to enable searching WARC files and retrieving
16+
// individual WARC records. It was built to generate test files with interesting
17+
// characteristics e.g. presence of continuations or Content-Encoding.
18+
//
19+
// This is a very basic implementation that just scans on "WARC/" to split WARC
20+
// files. Because it uses bufio.Scanner, it cannot retrieve very large WARC records
21+
// in this way (60kb is defined as limit to prevent panics).
22+
//
23+
// Once built, use e.g. `warcscan -s termone,termtwo -a -o output.warc input.warc`
24+
// The -s flag is a comma-separated list of search terms.
25+
// The -a flag says all terms must be matched (otherwise is an OR search).
26+
// The -o flag names the output file (defaults to out.warc)
27+
package main
28+
29+
import (
30+
"bufio"
31+
"bytes"
32+
"flag"
33+
"fmt"
34+
"io/ioutil"
35+
"os"
36+
)
37+
38+
var (
39+
search = flag.String("s", "", "enter comma-separated search terms")
40+
out = flag.String("o", "out.warc", "enter name of output file")
41+
all = flag.Bool("a", false, "all search terms must match")
42+
)
43+
44+
var marker = []byte("WARC/")
45+
46+
const maxbuf = 60000
47+
48+
func main() {
49+
flag.Parse()
50+
51+
searchStrings := bytes.Split([]byte(*search), []byte(","))
52+
if len(searchStrings) == 0 {
53+
fmt.Println("must provide comma-separated search terms for scanning, with -s flag")
54+
os.Exit(0)
55+
}
56+
57+
if flag.NArg() != 1 {
58+
fmt.Println("must provide an input warc file for scanning")
59+
os.Exit(0)
60+
}
61+
62+
f, e := os.Open(flag.Arg(0))
63+
if e != nil {
64+
fmt.Println(e)
65+
os.Exit(0)
66+
}
67+
68+
outBuf := &bytes.Buffer{}
69+
mainBuf := bufio.NewScanner(f)
70+
var overran bool
71+
72+
split := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
73+
if atEOF && len(data) == 0 {
74+
return 0, nil, nil
75+
}
76+
if i := bytes.Index(data, marker); i >= 0 {
77+
overran = false
78+
return i + len(marker), data[0:i], nil
79+
}
80+
if atEOF {
81+
return len(data), data, nil
82+
}
83+
if len(data) > maxbuf {
84+
overran = true
85+
return len(data), data, nil
86+
}
87+
return 0, nil, nil
88+
}
89+
90+
mainBuf.Split(split)
91+
for mainBuf.Scan() {
92+
if overran {
93+
continue
94+
}
95+
for idx, v := range searchStrings {
96+
if i := bytes.Index(mainBuf.Bytes(), v); i >= 0 {
97+
if *all && idx < len(searchStrings)-1 {
98+
continue
99+
}
100+
_, err := outBuf.Write(marker)
101+
if err == nil {
102+
_, err = outBuf.Write(mainBuf.Bytes())
103+
}
104+
if err != nil {
105+
fmt.Println(err)
106+
os.Exit(0)
107+
}
108+
break
109+
}
110+
if *all {
111+
break
112+
}
113+
}
114+
}
115+
116+
ioutil.WriteFile(*out, outBuf.Bytes(), 0666)
117+
}

warc.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,27 @@ func (h *warcHeader) MIME() string {
6666
}
6767
}
6868

69+
func (h *warcHeader) transferEncodings() []string {
70+
vals := getSelectValues(h.fields, "Transfer-Encoding")
71+
if vals[0] == "" {
72+
return nil
73+
}
74+
return splitAndReverse(vals[0])
75+
}
76+
func (h *warcHeader) encodings() []string {
77+
vals := getSelectValues(h.fields, "Transfer-Encoding", "Content-Encoding")
78+
if vals[0] == "" {
79+
if vals[1] == "" {
80+
return nil
81+
}
82+
return splitAndReverse(vals[1])
83+
}
84+
if vals[1] == "" {
85+
return splitAndReverse(vals[0])
86+
}
87+
return append(splitAndReverse(vals[0]), splitAndReverse(vals[1])...)
88+
}
89+
6990
// Fields returns a map of all WARC fields for the current Record.
7091
// If NextPayload was used, this map will also contain any stripped HTTP headers.
7192
func (h *warcHeader) Fields() map[string][]string { return getAllValues(h.fields) }

webarchive.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ type Header interface {
4343
Date() time.Time
4444
MIME() string
4545
Fields() map[string][]string
46+
// private methods - used by DecodePayload
47+
transferEncodings() []string
48+
encodings() []string
4649
}
4750

4851
// Content represents the content portion of a WARC or ARC record.
@@ -51,6 +54,8 @@ type Content interface {
5154
Read(p []byte) (n int, err error)
5255
Slice(off int64, l int) ([]byte, error)
5356
EofSlice(off int64, l int) ([]byte, error)
57+
// private method - used by DecodePayload
58+
peek(i int) ([]byte, error)
5459
}
5560

5661
// Reader represents the common methods shared by ARC, WARC and Multi readers.

0 commit comments

Comments
 (0)