|
package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.rarchives.ripme.ripper.AbstractRipper;
import com.rarchives.ripme.utils.Utils;
| 17 | + |
| 18 | +public class TumblrRipper extends AbstractRipper { |
| 19 | + |
| 20 | + private static final String DOMAIN = "tumblr.com", |
| 21 | + HOST = "tumblr"; |
| 22 | + private static final Logger logger = Logger.getLogger(TumblrRipper.class); |
| 23 | + |
| 24 | + private enum ALBUM_TYPE { |
| 25 | + SUBDOMAIN, |
| 26 | + TAG, |
| 27 | + POST |
| 28 | + } |
| 29 | + private ALBUM_TYPE albumType; |
| 30 | + private String subdomain, tagName, postNumber; |
| 31 | + |
| 32 | + private final String API_KEY; |
| 33 | + |
| 34 | + public TumblrRipper(URL url) throws IOException { |
| 35 | + super(url); |
| 36 | + API_KEY = Utils.getConfigString("tumblr.auth", null); |
| 37 | + if (API_KEY == null) { |
| 38 | + throw new IOException("Could not find tumblr authentication key in configuration"); |
| 39 | + } |
| 40 | + } |
| 41 | + |
| 42 | + @Override |
| 43 | + public boolean canRip(URL url) { |
| 44 | + return url.getHost().endsWith(DOMAIN); |
| 45 | + } |
| 46 | + |
| 47 | + @Override |
| 48 | + public URL sanitizeURL(URL url) throws MalformedURLException { |
| 49 | + return url; |
| 50 | + } |
| 51 | + |
| 52 | + @Override |
| 53 | + public void rip() throws IOException { |
| 54 | + String[] mediaTypes; |
| 55 | + if (albumType == ALBUM_TYPE.POST) { |
| 56 | + mediaTypes = new String[] { "post" }; |
| 57 | + } else { |
| 58 | + mediaTypes = new String[] { "photo", "video" }; |
| 59 | + } |
| 60 | + int offset; |
| 61 | + for (String mediaType : mediaTypes) { |
| 62 | + offset = 0; |
| 63 | + while (true) { |
| 64 | + String apiURL = getTumblrApiURL(mediaType, offset); |
| 65 | + logger.info(" Retrieving " + apiURL); |
| 66 | + Document doc = Jsoup.connect(apiURL) |
| 67 | + .ignoreContentType(true) |
| 68 | + .header("User-agent", USER_AGENT) |
| 69 | + .get(); |
| 70 | + String jsonString = doc.body().html().replaceAll(""", "\""); |
| 71 | + if (!handleJSON(jsonString)) { |
| 72 | + // Returns false if an error occurs and we should stop. |
| 73 | + break; |
| 74 | + } |
| 75 | + try { |
| 76 | + Thread.sleep(1000); |
| 77 | + } catch (InterruptedException e) { |
| 78 | + logger.error("[!] Exception while waiting to load next album:", e); |
| 79 | + break; |
| 80 | + } |
| 81 | + offset += 20; |
| 82 | + } |
| 83 | + } |
| 84 | + waitForThreads(); |
| 85 | + } |
| 86 | + |
| 87 | + private boolean handleJSON(String jsonString) { |
| 88 | + JSONObject json = new JSONObject(jsonString); |
| 89 | + if (json == null || !json.has("response")) { |
| 90 | + logger.error("[!] JSON response from tumblr was invalid: " + jsonString); |
| 91 | + return false; |
| 92 | + } |
| 93 | + JSONArray posts, photos; |
| 94 | + JSONObject post, photo; |
| 95 | + URL fileURL; |
| 96 | + |
| 97 | + posts = json.getJSONObject("response").getJSONArray("posts"); |
| 98 | + if (posts.length() == 0) { |
| 99 | + logger.info(" Zero posts returned. Dropping out."); |
| 100 | + return false; |
| 101 | + } |
| 102 | + |
| 103 | + for (int i = 0; i < posts.length(); i++) { |
| 104 | + post = posts.getJSONObject(i); |
| 105 | + if (post.has("photos")) { |
| 106 | + photos = post.getJSONArray("photos"); |
| 107 | + for (int j = 0; j < photos.length(); j++) { |
| 108 | + photo = photos.getJSONObject(j); |
| 109 | + try { |
| 110 | + fileURL = new URL(photo.getJSONObject("original_size").getString("url")); |
| 111 | + addURLToDownload(fileURL); |
| 112 | + } catch (Exception e) { |
| 113 | + logger.error("[!] Error while parsing photo in " + photo, e); |
| 114 | + continue; |
| 115 | + } |
| 116 | + } |
| 117 | + } else if (post.has("video_url")) { |
| 118 | + try { |
| 119 | + fileURL = new URL(post.getString("video_url")); |
| 120 | + addURLToDownload(fileURL); |
| 121 | + } catch (Exception e) { |
| 122 | + logger.error("[!] Error while parsing video in " + post, e); |
| 123 | + return true; |
| 124 | + } |
| 125 | + } |
| 126 | + if (albumType == ALBUM_TYPE.POST) { |
| 127 | + return false; |
| 128 | + } |
| 129 | + } |
| 130 | + return true; |
| 131 | + } |
| 132 | + |
| 133 | + private String getTumblrApiURL(String mediaType, int offset) { |
| 134 | + StringBuilder sb = new StringBuilder(); |
| 135 | + if (albumType == ALBUM_TYPE.POST) { |
| 136 | + sb.append("http://api.tumblr.com/v2/blog/") |
| 137 | + .append(subdomain) |
| 138 | + .append(".tumblr.com/posts?id=") |
| 139 | + .append(postNumber) |
| 140 | + .append("&api_key=") |
| 141 | + .append(API_KEY); |
| 142 | + return sb.toString(); |
| 143 | + } |
| 144 | + sb.append("http://api.tumblr.com/v2/blog/") |
| 145 | + .append(subdomain) |
| 146 | + .append(".tumblr.com/posts/") |
| 147 | + .append(mediaType) |
| 148 | + .append("?api_key=") |
| 149 | + .append(API_KEY) |
| 150 | + .append("&offset=") |
| 151 | + .append(offset); |
| 152 | + if (albumType == ALBUM_TYPE.TAG) { |
| 153 | + sb.append("&tag=") |
| 154 | + .append(tagName); |
| 155 | + } |
| 156 | + return sb.toString(); |
| 157 | + } |
| 158 | + |
| 159 | + @Override |
| 160 | + public String getHost() { |
| 161 | + return HOST; |
| 162 | + } |
| 163 | + |
| 164 | + @Override |
| 165 | + public String getGID(URL url) throws MalformedURLException { |
| 166 | + Pattern p; |
| 167 | + Matcher m; |
| 168 | + // Tagged URL |
| 169 | + p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.tumblr\\.com/tagged/([a-zA-Z0-9\\-]{1,}).*$"); |
| 170 | + m = p.matcher(url.toExternalForm()); |
| 171 | + if (m.matches()) { |
| 172 | + this.albumType = ALBUM_TYPE.TAG; |
| 173 | + this.subdomain = m.group(1); |
| 174 | + this.tagName = m.group(2); |
| 175 | + this.tagName = this.tagName.replace('-', '+').replace("_", "%20"); |
| 176 | + return this.subdomain + "_tag_" + this.tagName; |
| 177 | + } |
| 178 | + // Post URL |
| 179 | + p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.tumblr\\.com/post/([0-9]{1,}).*$"); |
| 180 | + m = p.matcher(url.toExternalForm()); |
| 181 | + if (m.matches()) { |
| 182 | + this.albumType = ALBUM_TYPE.POST; |
| 183 | + this.subdomain = m.group(1); |
| 184 | + this.postNumber = m.group(2); |
| 185 | + return this.subdomain + "_post_" + this.postNumber; |
| 186 | + } |
| 187 | + // Subdomain-level URL |
| 188 | + p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.tumblr\\.com/?.*$"); |
| 189 | + m = p.matcher(url.toExternalForm()); |
| 190 | + if (m.matches()) { |
| 191 | + this.albumType = ALBUM_TYPE.SUBDOMAIN; |
| 192 | + this.subdomain = m.group(1); |
| 193 | + return this.subdomain; |
| 194 | + } |
| 195 | + // TODO support non-tumblr.com domains |
| 196 | + throw new MalformedURLException("Expected format: http://user.tumblr.com[/tagged/tag|/post/postno]"); |
| 197 | + } |
| 198 | + |
| 199 | +} |
0 commit comments