@@ -3,13 +3,12 @@ package instagram
33import (
44 "encoding/json"
55 netURL "net/url"
6- "path"
76 "strings"
87
98 "github.com/pkg/errors"
9+ "golang.org/x/net/html"
1010
1111 "github.com/iawia002/lux/extractors"
12- "github.com/iawia002/lux/parser"
1312 "github.com/iawia002/lux/request"
1413 "github.com/iawia002/lux/utils"
1514)
@@ -18,18 +17,30 @@ func init() {
1817 extractors .Register ("instagram" , New ())
1918}
2019
21- type instagram struct {
22- ShortcodeMedia struct {
23- EdgeSidecar struct {
24- Edges []struct {
25- Node struct {
26- DisplayURL string `json:"display_url"`
27- IsVideo bool `json:"is_video"`
28- VideoURL string `json:"video_url"`
29- } `json:"node"`
30- } `json:"edges"`
31- } `json:"edge_sidecar_to_children"`
32- } `json:"shortcode_media"`
// instagramPayload mirrors the subset of the JSON document that Extract
// decodes from the page's <script type="application/ld+json"> element.
//
// Only Videos[i].ContentURL and Images[i].URL are consumed when building
// download parts; the remaining fields are decoded but otherwise unused in
// this file. Height and Width are decoded as JSON strings.
type instagramPayload struct {
	ArticleBody string `json:"articleBody"`
	Author      struct {
		Image           string `json:"image"`
		Name            string `json:"name"`
		AlternativeName string `json:"alternativeName"`
		Url             string `json:"url"`
	} `json:"author"`
	Videos []struct {
		// UploadData is bound to the "uploadDate" key. It was previously
		// tagged "string", which never matches a key in the payload and left
		// the field permanently empty. The Go field name is kept as-is so
		// existing references keep compiling.
		UploadData   string `json:"uploadDate"`
		Description  string `json:"description"`
		Name         string `json:"name"`
		Caption      string `json:"caption"`
		Height       string `json:"height"`
		Width        string `json:"width"`
		ContentURL   string `json:"contentUrl"`
		ThumbnailURL string `json:"thumbnailUrl"`
	} `json:"video"`
	Images []struct {
		Caption string `json:"caption"`
		Height  string `json:"height"`
		Width   string `json:"width"`
		URL     string `json:"url"`
	} `json:"image"`
}
3445
3546type extractor struct {}
@@ -39,104 +50,65 @@ func New() extractors.Extractor {
3950 return & extractor {}
4051}
4152
42- func extractImageFromPage (html , url string ) (map [string ]* extractors.Stream , error ) {
43- _ , realURLs , err := parser .GetImages (html , "EmbeddedMediaImage" , nil )
53+ // Extract is the main function to extract the data.
54+ func (e * extractor ) Extract (url string , option extractors.Options ) ([]* extractors.Data , error ) {
55+ u , err := netURL .Parse (url )
4456 if err != nil {
4557 return nil , errors .WithStack (err )
4658 }
4759
48- urls := make ([]* extractors.Part , 0 , len (realURLs ))
49- var totalSize int64
50- for _ , realURL := range realURLs {
51- size , err := request .Size (realURL , url )
52- if err != nil {
53- return nil , errors .WithStack (err )
54- }
55- urlData := & extractors.Part {
56- URL : realURL ,
57- Size : size ,
58- Ext : "jpg" ,
59- }
60- urls = append (urls , urlData )
61- totalSize += size
60+ htmlResp , err := request .Get (u .String (), url , nil )
61+ if err != nil {
62+ return nil , errors .WithStack (err )
6263 }
6364
64- return map [string ]* extractors.Stream {
65- "default" : {
66- Parts : urls ,
67- Size : totalSize ,
68- },
69- }, nil
70- }
65+ reader := strings .NewReader (htmlResp )
66+ htmlRoot , err := html .Parse (reader )
67+ if err != nil {
68+ return nil , errors .WithStack (err )
69+ }
7170
72- func extractFromData (dataString , url string ) (map [string ]* extractors.Stream , error ) {
73- var data instagram
74- if err := json .Unmarshal ([]byte (dataString ), & data ); err != nil {
71+ sNode , err := dfsFindScript (htmlRoot )
72+ if err != nil {
73+ return nil , errors .WithStack (err )
74+ }
75+
76+ var payload instagramPayload
77+ if err = json .Unmarshal ([]byte (sNode .Data ), & payload ); err != nil {
7578 return nil , errors .WithStack (err )
7679 }
7780
78- urls := make ([]* extractors.Part , 0 , len (data .ShortcodeMedia .EdgeSidecar .Edges ))
7981 var totalSize int64
80- for _ , u := range data .ShortcodeMedia .EdgeSidecar .Edges {
81- // Image
82- realURL := u .Node .DisplayURL
83- ext := "jpg"
84- if u .Node .IsVideo {
85- // Video
86- realURL = u .Node .VideoURL
87- ext = "mp4"
82+ var parts []* extractors.Part
83+ if len (payload .Videos ) > 0 {
84+ videoParts , err := createPartVideos (& payload , url )
85+ if err != nil {
86+ return nil , errors .WithStack (extractors .ErrBodyParseFailed )
8887 }
8988
90- size , err := request .Size (realURL , url )
89+ parts = append (parts , videoParts ... )
90+ }
91+ if len (payload .Images ) > 0 {
92+ imageParts , err := createPartImages (& payload , url )
9193 if err != nil {
92- return nil , errors .WithStack (err )
93- }
94- urlData := & extractors.Part {
95- URL : realURL ,
96- Size : size ,
97- Ext : ext ,
94+ return nil , errors .WithStack (extractors .ErrBodyParseFailed )
9895 }
99- urls = append (urls , urlData )
100- totalSize += size
96+
97+ parts = append (parts , imageParts ... )
98+ }
99+
100+ for _ , part := range parts {
101+ totalSize += part .Size
101102 }
102103
103- return map [string ]* extractors.Stream {
104+ streams := map [string ]* extractors.Stream {
104105 "default" : {
105- Parts : urls ,
106+ Parts : parts ,
106107 Size : totalSize ,
107108 },
108- }, nil
109- }
110-
111- // Extract is the main function to extract the data.
112- func (e * extractor ) Extract (url string , option extractors.Options ) ([]* extractors.Data , error ) {
113- // Instagram is forcing a login to access the page, so we use the embed page to bypass that.
114- u , err := netURL .Parse (url )
115- if err != nil {
116- return nil , errors .WithStack (err )
117109 }
118- id := u .Path [strings .LastIndex (u .Path , "/" )+ 1 :]
119- u .Path = path .Join (u .Path , "embed" )
120110
121- html , err := request .Get (u .String (), url , nil )
122- if err != nil {
123- return nil , errors .WithStack (err )
124- }
125- dataStrings := utils .MatchOneOf (html , `window\.__additionalDataLoaded\('graphql',(.*)\);` )
126- if dataStrings == nil || len (dataStrings ) < 2 {
127- return nil , errors .WithStack (extractors .ErrURLParseFailed )
128- }
129- dataString := dataStrings [1 ]
130-
131- var streams map [string ]* extractors.Stream
132- if dataString == "" || dataString == "null" {
133- streams , err = extractImageFromPage (html , url )
134- } else {
135- streams , err = extractFromData (dataString , url )
136- }
137- if err != nil {
138- return nil , errors .WithStack (err )
139- }
111+ id := u .Path [strings .LastIndex (u .Path , "/" )+ 1 :]
140112
141113 return []* extractors.Data {
142114 {
@@ -148,3 +120,65 @@ func (e *extractor) Extract(url string, option extractors.Options) ([]*extractor
148120 },
149121 }, nil
150122}
123+
124+ func dfsFindScript (n * html.Node ) (* html.Node , error ) {
125+ if n .Type == html .ElementNode && n .Data == "script" {
126+ for _ , attr := range n .Attr {
127+ if attr .Key == "type" && attr .Val == "application/ld+json" {
128+ return n .FirstChild , nil
129+ }
130+ }
131+ }
132+
133+ for c := n .FirstChild ; c != nil ; c = c .NextSibling {
134+ if ret , err := dfsFindScript (c ); err == nil {
135+ return ret , nil
136+ }
137+ }
138+
139+ return nil , errors .WithStack (extractors .ErrBodyParseFailed )
140+ }
141+
142+ func createPartVideos (payload * instagramPayload , ref string ) (parts []* extractors.Part , err error ) {
143+ for _ , it := range payload .Videos {
144+ _ , ext , err := utils .GetNameAndExt (it .ContentURL )
145+ if err != nil {
146+ return parts , errors .WithStack (err )
147+ }
148+ filesize , err := request .Size (it .ContentURL , ref )
149+ if err != nil {
150+ return parts , errors .WithStack (err )
151+ }
152+
153+ part := & extractors.Part {
154+ URL : it .ContentURL ,
155+ Size : filesize ,
156+ Ext : ext ,
157+ }
158+ parts = append (parts , part )
159+ }
160+
161+ return parts , err
162+ }
163+
164+ func createPartImages (payload * instagramPayload , ref string ) (parts []* extractors.Part , err error ) {
165+ for _ , it := range payload .Images {
166+ _ , ext , err := utils .GetNameAndExt (it .URL )
167+ if err != nil {
168+ return parts , errors .WithStack (err )
169+ }
170+ filesize , err := request .Size (it .URL , ref )
171+ if err != nil {
172+ return parts , errors .WithStack (err )
173+ }
174+
175+ part := & extractors.Part {
176+ URL : it .URL ,
177+ Size : filesize ,
178+ Ext : ext ,
179+ }
180+ parts = append (parts , part )
181+ }
182+
183+ return parts , err
184+ }
0 commit comments