Skip to content

Commit c288afd

Browse files
committed
Tumblr support
1 parent 034dc1d commit c288afd

4 files changed

Lines changed: 234 additions & 0 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
.DS_Store
33
ripme.log
44
rips/
5+
.history
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
package com.rarchives.ripme.ripper.rippers;
2+
3+
import java.io.IOException;
4+
import java.net.MalformedURLException;
5+
import java.net.URL;
6+
import java.util.regex.Matcher;
7+
import java.util.regex.Pattern;
8+
9+
import org.apache.log4j.Logger;
10+
import org.json.JSONArray;
11+
import org.json.JSONObject;
12+
import org.jsoup.Jsoup;
13+
import org.jsoup.nodes.Document;
14+
15+
import com.rarchives.ripme.ripper.AbstractRipper;
16+
import com.rarchives.ripme.utils.Utils;
17+
18+
public class TumblrRipper extends AbstractRipper {
19+
20+
private static final String DOMAIN = "tumblr.com",
21+
HOST = "tumblr";
22+
private static final Logger logger = Logger.getLogger(TumblrRipper.class);
23+
24+
private enum ALBUM_TYPE {
25+
SUBDOMAIN,
26+
TAG,
27+
POST
28+
}
29+
private ALBUM_TYPE albumType;
30+
private String subdomain, tagName, postNumber;
31+
32+
private final String API_KEY;
33+
34+
public TumblrRipper(URL url) throws IOException {
35+
super(url);
36+
API_KEY = Utils.getConfigString("tumblr.auth", null);
37+
if (API_KEY == null) {
38+
throw new IOException("Could not find tumblr authentication key in configuration");
39+
}
40+
}
41+
42+
@Override
43+
public boolean canRip(URL url) {
44+
return url.getHost().endsWith(DOMAIN);
45+
}
46+
47+
@Override
48+
public URL sanitizeURL(URL url) throws MalformedURLException {
49+
return url;
50+
}
51+
52+
@Override
53+
public void rip() throws IOException {
54+
String[] mediaTypes;
55+
if (albumType == ALBUM_TYPE.POST) {
56+
mediaTypes = new String[] { "post" };
57+
} else {
58+
mediaTypes = new String[] { "photo", "video" };
59+
}
60+
int offset;
61+
for (String mediaType : mediaTypes) {
62+
offset = 0;
63+
while (true) {
64+
String apiURL = getTumblrApiURL(mediaType, offset);
65+
logger.info(" Retrieving " + apiURL);
66+
Document doc = Jsoup.connect(apiURL)
67+
.ignoreContentType(true)
68+
.header("User-agent", USER_AGENT)
69+
.get();
70+
String jsonString = doc.body().html().replaceAll(""", "\"");
71+
if (!handleJSON(jsonString)) {
72+
// Returns false if an error occurs and we should stop.
73+
break;
74+
}
75+
try {
76+
Thread.sleep(1000);
77+
} catch (InterruptedException e) {
78+
logger.error("[!] Exception while waiting to load next album:", e);
79+
break;
80+
}
81+
offset += 20;
82+
}
83+
}
84+
waitForThreads();
85+
}
86+
87+
private boolean handleJSON(String jsonString) {
88+
JSONObject json = new JSONObject(jsonString);
89+
if (json == null || !json.has("response")) {
90+
logger.error("[!] JSON response from tumblr was invalid: " + jsonString);
91+
return false;
92+
}
93+
JSONArray posts, photos;
94+
JSONObject post, photo;
95+
URL fileURL;
96+
97+
posts = json.getJSONObject("response").getJSONArray("posts");
98+
if (posts.length() == 0) {
99+
logger.info(" Zero posts returned. Dropping out.");
100+
return false;
101+
}
102+
103+
for (int i = 0; i < posts.length(); i++) {
104+
post = posts.getJSONObject(i);
105+
if (post.has("photos")) {
106+
photos = post.getJSONArray("photos");
107+
for (int j = 0; j < photos.length(); j++) {
108+
photo = photos.getJSONObject(j);
109+
try {
110+
fileURL = new URL(photo.getJSONObject("original_size").getString("url"));
111+
addURLToDownload(fileURL);
112+
} catch (Exception e) {
113+
logger.error("[!] Error while parsing photo in " + photo, e);
114+
continue;
115+
}
116+
}
117+
} else if (post.has("video_url")) {
118+
try {
119+
fileURL = new URL(post.getString("video_url"));
120+
addURLToDownload(fileURL);
121+
} catch (Exception e) {
122+
logger.error("[!] Error while parsing video in " + post, e);
123+
return true;
124+
}
125+
}
126+
if (albumType == ALBUM_TYPE.POST) {
127+
return false;
128+
}
129+
}
130+
return true;
131+
}
132+
133+
private String getTumblrApiURL(String mediaType, int offset) {
134+
StringBuilder sb = new StringBuilder();
135+
if (albumType == ALBUM_TYPE.POST) {
136+
sb.append("http://api.tumblr.com/v2/blog/")
137+
.append(subdomain)
138+
.append(".tumblr.com/posts?id=")
139+
.append(postNumber)
140+
.append("&api_key=")
141+
.append(API_KEY);
142+
return sb.toString();
143+
}
144+
sb.append("http://api.tumblr.com/v2/blog/")
145+
.append(subdomain)
146+
.append(".tumblr.com/posts/")
147+
.append(mediaType)
148+
.append("?api_key=")
149+
.append(API_KEY)
150+
.append("&offset=")
151+
.append(offset);
152+
if (albumType == ALBUM_TYPE.TAG) {
153+
sb.append("&tag=")
154+
.append(tagName);
155+
}
156+
return sb.toString();
157+
}
158+
159+
@Override
160+
public String getHost() {
161+
return HOST;
162+
}
163+
164+
@Override
165+
public String getGID(URL url) throws MalformedURLException {
166+
Pattern p;
167+
Matcher m;
168+
// Tagged URL
169+
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.tumblr\\.com/tagged/([a-zA-Z0-9\\-]{1,}).*$");
170+
m = p.matcher(url.toExternalForm());
171+
if (m.matches()) {
172+
this.albumType = ALBUM_TYPE.TAG;
173+
this.subdomain = m.group(1);
174+
this.tagName = m.group(2);
175+
this.tagName = this.tagName.replace('-', '+').replace("_", "%20");
176+
return this.subdomain + "_tag_" + this.tagName;
177+
}
178+
// Post URL
179+
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.tumblr\\.com/post/([0-9]{1,}).*$");
180+
m = p.matcher(url.toExternalForm());
181+
if (m.matches()) {
182+
this.albumType = ALBUM_TYPE.POST;
183+
this.subdomain = m.group(1);
184+
this.postNumber = m.group(2);
185+
return this.subdomain + "_post_" + this.postNumber;
186+
}
187+
// Subdomain-level URL
188+
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.tumblr\\.com/?.*$");
189+
m = p.matcher(url.toExternalForm());
190+
if (m.matches()) {
191+
this.albumType = ALBUM_TYPE.SUBDOMAIN;
192+
this.subdomain = m.group(1);
193+
return this.subdomain;
194+
}
195+
// TODO support non-tumblr.com domains
196+
throw new MalformedURLException("Expected format: http://user.tumblr.com[/tagged/tag|/post/postno]");
197+
}
198+
199+
}

src/main/resources/rip.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ threads.size = 5
22
file.overwrite = false
33
download.retries = 3
44
twitter.auth = VW9Ybjdjb1pkd2J0U3kwTUh2VXVnOm9GTzVQVzNqM29LQU1xVGhnS3pFZzhKbGVqbXU0c2lHQ3JrUFNNZm8=
5+
tumblr.auth = v5kUqGQXUtmF7K0itri1DGtgTs0VQpbSEbh1jxYgj9d2Sq18F8
56
gw.api = gonewild
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package com.rarchives.ripme.tst.ripper.rippers;
2+
3+
import java.io.IOException;
4+
import java.net.URL;
5+
import java.util.ArrayList;
6+
import java.util.List;
7+
8+
import com.rarchives.ripme.ripper.rippers.TumblrRipper;
9+
10+
public class TumblrRipperTest extends RippersTest {
11+
12+
public void testTumblrAlbums() throws IOException {
13+
if (false && !DOWNLOAD_CONTENT) {
14+
return;
15+
}
16+
List<URL> contentURLs = new ArrayList<URL>();
17+
contentURLs.add(new URL("http://wrouinr.tumblr.com/archive"));
18+
contentURLs.add(new URL("http://topinstagirls.tumblr.com/tagged/berlinskaya"));
19+
contentURLs.add(new URL("http://fittingroomgirls.tumblr.com/post/78268776776"));
20+
for (URL url : contentURLs) {
21+
try {
22+
TumblrRipper ripper = new TumblrRipper(url);
23+
ripper.rip();
24+
assert(ripper.getWorkingDir().listFiles().length > 1);
25+
deleteDir(ripper.getWorkingDir());
26+
} catch (Exception e) {
27+
e.printStackTrace();
28+
fail("Error while ripping URL " + url + ": " + e.getMessage());
29+
}
30+
}
31+
}
32+
33+
}

0 commit comments

Comments
 (0)