Skip to content

Commit 99f8093

Browse files
shavit and iawia002 authored
1228 fix instagram download (#1252)
* Extract Instagram payload Read the payload of stories and images * Extract images and videos * Update the file sizes in Instagram tests * go mod tidy --------- Co-authored-by: Xinzhao Xu <[email protected]>
1 parent 3da4af3 commit 99f8093

4 files changed

Lines changed: 129 additions & 94 deletions

File tree

extractors/errors.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ var (
99
ErrURLParseFailed = errors.New("url parse failed")
1010
ErrInvalidRegularExpression = errors.New("invalid regular expression")
1111
ErrURLQueryParamsParseFailed = errors.New("url query params parse failed")
12+
ErrBodyParseFailed = errors.New("body parse failed")
1213
)

extractors/instagram/instagram.go

Lines changed: 124 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,12 @@ package instagram
33
import (
44
"encoding/json"
55
netURL "net/url"
6-
"path"
76
"strings"
87

98
"github.com/pkg/errors"
9+
"golang.org/x/net/html"
1010

1111
"github.com/iawia002/lux/extractors"
12-
"github.com/iawia002/lux/parser"
1312
"github.com/iawia002/lux/request"
1413
"github.com/iawia002/lux/utils"
1514
)
@@ -18,18 +17,30 @@ func init() {
1817
extractors.Register("instagram", New())
1918
}
2019

21-
type instagram struct {
22-
ShortcodeMedia struct {
23-
EdgeSidecar struct {
24-
Edges []struct {
25-
Node struct {
26-
DisplayURL string `json:"display_url"`
27-
IsVideo bool `json:"is_video"`
28-
VideoURL string `json:"video_url"`
29-
} `json:"node"`
30-
} `json:"edges"`
31-
} `json:"edge_sidecar_to_children"`
32-
} `json:"shortcode_media"`
20+
// instagramPayload is the decoding target for the JSON document embedded in
// an Instagram page's <script type="application/ld+json"> element. Only the
// keys named in the struct tags below are decoded; every other key in the
// document is ignored by encoding/json.
type instagramPayload struct {
	ArticleBody string `json:"articleBody"`
	Author      struct {
		Image           string `json:"image"`
		Name            string `json:"name"`
		AlternativeName string `json:"alternativeName"`
		Url             string `json:"url"`
	} `json:"author"`
	// Videos holds the entries of the "video" array.
	Videos []struct {
		// UploadData is filled from the "uploadDate" key. The Go field name is
		// kept as-is so existing references in the package keep compiling.
		UploadData   string `json:"uploadDate"`
		Description  string `json:"description"`
		Name         string `json:"name"`
		Caption      string `json:"caption"`
		Height       string `json:"height"`
		Width        string `json:"width"`
		ContentURL   string `json:"contentUrl"`
		ThumbnailURL string `json:"thumbnailUrl"`
	} `json:"video"`
	// Images holds the entries of the "image" array.
	Images []struct {
		Caption string `json:"caption"`
		Height  string `json:"height"`
		Width   string `json:"width"`
		URL     string `json:"url"`
	} `json:"image"`
}
3445

3546
type extractor struct{}
@@ -39,104 +50,65 @@ func New() extractors.Extractor {
3950
return &extractor{}
4051
}
4152

42-
func extractImageFromPage(html, url string) (map[string]*extractors.Stream, error) {
43-
_, realURLs, err := parser.GetImages(html, "EmbeddedMediaImage", nil)
53+
// Extract is the main function to extract the data.
54+
func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) {
55+
u, err := netURL.Parse(url)
4456
if err != nil {
4557
return nil, errors.WithStack(err)
4658
}
4759

48-
urls := make([]*extractors.Part, 0, len(realURLs))
49-
var totalSize int64
50-
for _, realURL := range realURLs {
51-
size, err := request.Size(realURL, url)
52-
if err != nil {
53-
return nil, errors.WithStack(err)
54-
}
55-
urlData := &extractors.Part{
56-
URL: realURL,
57-
Size: size,
58-
Ext: "jpg",
59-
}
60-
urls = append(urls, urlData)
61-
totalSize += size
60+
htmlResp, err := request.Get(u.String(), url, nil)
61+
if err != nil {
62+
return nil, errors.WithStack(err)
6263
}
6364

64-
return map[string]*extractors.Stream{
65-
"default": {
66-
Parts: urls,
67-
Size: totalSize,
68-
},
69-
}, nil
70-
}
65+
reader := strings.NewReader(htmlResp)
66+
htmlRoot, err := html.Parse(reader)
67+
if err != nil {
68+
return nil, errors.WithStack(err)
69+
}
7170

72-
func extractFromData(dataString, url string) (map[string]*extractors.Stream, error) {
73-
var data instagram
74-
if err := json.Unmarshal([]byte(dataString), &data); err != nil {
71+
sNode, err := dfsFindScript(htmlRoot)
72+
if err != nil {
73+
return nil, errors.WithStack(err)
74+
}
75+
76+
var payload instagramPayload
77+
if err = json.Unmarshal([]byte(sNode.Data), &payload); err != nil {
7578
return nil, errors.WithStack(err)
7679
}
7780

78-
urls := make([]*extractors.Part, 0, len(data.ShortcodeMedia.EdgeSidecar.Edges))
7981
var totalSize int64
80-
for _, u := range data.ShortcodeMedia.EdgeSidecar.Edges {
81-
// Image
82-
realURL := u.Node.DisplayURL
83-
ext := "jpg"
84-
if u.Node.IsVideo {
85-
// Video
86-
realURL = u.Node.VideoURL
87-
ext = "mp4"
82+
var parts []*extractors.Part
83+
if len(payload.Videos) > 0 {
84+
videoParts, err := createPartVideos(&payload, url)
85+
if err != nil {
86+
return nil, errors.WithStack(extractors.ErrBodyParseFailed)
8887
}
8988

90-
size, err := request.Size(realURL, url)
89+
parts = append(parts, videoParts...)
90+
}
91+
if len(payload.Images) > 0 {
92+
imageParts, err := createPartImages(&payload, url)
9193
if err != nil {
92-
return nil, errors.WithStack(err)
93-
}
94-
urlData := &extractors.Part{
95-
URL: realURL,
96-
Size: size,
97-
Ext: ext,
94+
return nil, errors.WithStack(extractors.ErrBodyParseFailed)
9895
}
99-
urls = append(urls, urlData)
100-
totalSize += size
96+
97+
parts = append(parts, imageParts...)
98+
}
99+
100+
for _, part := range parts {
101+
totalSize += part.Size
101102
}
102103

103-
return map[string]*extractors.Stream{
104+
streams := map[string]*extractors.Stream{
104105
"default": {
105-
Parts: urls,
106+
Parts: parts,
106107
Size: totalSize,
107108
},
108-
}, nil
109-
}
110-
111-
// Extract is the main function to extract the data.
112-
func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) {
113-
// Instagram is forcing a login to access the page, so we use the embed page to bypass that.
114-
u, err := netURL.Parse(url)
115-
if err != nil {
116-
return nil, errors.WithStack(err)
117109
}
118-
id := u.Path[strings.LastIndex(u.Path, "/")+1:]
119-
u.Path = path.Join(u.Path, "embed")
120110

121-
html, err := request.Get(u.String(), url, nil)
122-
if err != nil {
123-
return nil, errors.WithStack(err)
124-
}
125-
dataStrings := utils.MatchOneOf(html, `window\.__additionalDataLoaded\('graphql',(.*)\);`)
126-
if dataStrings == nil || len(dataStrings) < 2 {
127-
return nil, errors.WithStack(extractors.ErrURLParseFailed)
128-
}
129-
dataString := dataStrings[1]
130-
131-
var streams map[string]*extractors.Stream
132-
if dataString == "" || dataString == "null" {
133-
streams, err = extractImageFromPage(html, url)
134-
} else {
135-
streams, err = extractFromData(dataString, url)
136-
}
137-
if err != nil {
138-
return nil, errors.WithStack(err)
139-
}
111+
id := u.Path[strings.LastIndex(u.Path, "/")+1:]
140112

141113
return []*extractors.Data{
142114
{
@@ -148,3 +120,65 @@ func (e *extractor) Extract(url string, option extractors.Options) ([]*extractor
148120
},
149121
}, nil
150122
}
123+
124+
func dfsFindScript(n *html.Node) (*html.Node, error) {
125+
if n.Type == html.ElementNode && n.Data == "script" {
126+
for _, attr := range n.Attr {
127+
if attr.Key == "type" && attr.Val == "application/ld+json" {
128+
return n.FirstChild, nil
129+
}
130+
}
131+
}
132+
133+
for c := n.FirstChild; c != nil; c = c.NextSibling {
134+
if ret, err := dfsFindScript(c); err == nil {
135+
return ret, nil
136+
}
137+
}
138+
139+
return nil, errors.WithStack(extractors.ErrBodyParseFailed)
140+
}
141+
142+
func createPartVideos(payload *instagramPayload, ref string) (parts []*extractors.Part, err error) {
143+
for _, it := range payload.Videos {
144+
_, ext, err := utils.GetNameAndExt(it.ContentURL)
145+
if err != nil {
146+
return parts, errors.WithStack(err)
147+
}
148+
filesize, err := request.Size(it.ContentURL, ref)
149+
if err != nil {
150+
return parts, errors.WithStack(err)
151+
}
152+
153+
part := &extractors.Part{
154+
URL: it.ContentURL,
155+
Size: filesize,
156+
Ext: ext,
157+
}
158+
parts = append(parts, part)
159+
}
160+
161+
return parts, err
162+
}
163+
164+
func createPartImages(payload *instagramPayload, ref string) (parts []*extractors.Part, err error) {
165+
for _, it := range payload.Images {
166+
_, ext, err := utils.GetNameAndExt(it.URL)
167+
if err != nil {
168+
return parts, errors.WithStack(err)
169+
}
170+
filesize, err := request.Size(it.URL, ref)
171+
if err != nil {
172+
return parts, errors.WithStack(err)
173+
}
174+
175+
part := &extractors.Part{
176+
URL: it.URL,
177+
Size: filesize,
178+
Ext: ext,
179+
}
180+
parts = append(parts, part)
181+
}
182+
183+
return parts, err
184+
}

extractors/instagram/instagram_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,23 @@ func TestDownload(t *testing.T) {
1717
args: test.Args{
1818
URL: "https://www.instagram.com/p/BlIka1ZFCNr",
1919
Title: "Instagram BlIka1ZFCNr",
20-
Size: 3003662,
20+
Size: 577298,
2121
},
2222
},
2323
{
2424
name: "image test",
2525
args: test.Args{
2626
URL: "https://www.instagram.com/p/Bl5oVUyl9Yx",
2727
Title: "Instagram Bl5oVUyl9Yx",
28-
Size: 250596,
28+
Size: 101611,
2929
},
3030
},
3131
{
3232
name: "image album test",
3333
args: test.Args{
3434
URL: "https://www.instagram.com/p/Bjyr-gxF4Rb",
3535
Title: "Instagram Bjyr-gxF4Rb",
36-
Size: 4599909,
36+
Size: 241466,
3737
},
3838
},
3939
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ require (
1616
github.com/pkg/errors v0.9.1
1717
github.com/robertkrimen/otto v0.0.0-20211024170158-b87d35c0b86f
1818
github.com/urfave/cli/v2 v2.6.0
19+
golang.org/x/net v0.7.0
1920
)
2021

2122
require (
@@ -37,7 +38,6 @@ require (
3738
github.com/rogpeppe/go-internal v1.9.0 // indirect
3839
github.com/russross/blackfriday/v2 v2.1.0 // indirect
3940
golang.org/x/exp v0.0.0-20220518171630-0b5c67f07fdf // indirect
40-
golang.org/x/net v0.7.0 // indirect
4141
golang.org/x/sys v0.5.0 // indirect
4242
golang.org/x/text v0.7.0 // indirect
4343
gopkg.in/sourcemap.v1 v1.0.5 // indirect

0 commit comments

Comments
 (0)