Skip to content

Commit

Permalink
Added instagram ripper, integration tests.
Browse files Browse the repository at this point in the history
Also fixed parts of the imgur ripper.
  • Loading branch information
4pr0n committed Mar 3, 2014
1 parent c5c5505 commit e2bb412
Show file tree
Hide file tree
Showing 7 changed files with 296 additions and 9 deletions.
10 changes: 8 additions & 2 deletions src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ public void downloadErrored(URL url, String reason) {
}

private void checkIfComplete() {
System.err.println("Pending: " + itemsPending.size() + ", Completed: " + itemsCompleted.size() + ", Errored: " + itemsErrored.size());
if (!completed && itemsPending.size() == 0) {
completed = true;
logger.info("Rip completed!");
Expand All @@ -193,6 +192,10 @@ private void checkIfComplete() {
/**
 * Returns the URL held in this ripper's {@code url} field.
 *
 * @return the current value of {@code url}
 */
public URL getURL() {
    return this.url;
}

/**
 * Returns the directory held in this ripper's {@code workingDir} field.
 *
 * @return the current value of {@code workingDir}
 */
public File getWorkingDir() {
    return this.workingDir;
}

public void setWorkingDir(URL url) throws IOException {
String path = Utils.getWorkingDirectory().getCanonicalPath();
Expand Down Expand Up @@ -224,6 +227,7 @@ public static AbstractRipper getRipper(URL url) throws Exception {
return ripper;
} catch (Exception e) {
// Incompatible rippers *will* throw exceptions during instantiation.
logger.error("Excepion while instantiating: " + constructor.getClass().getName(), e);
}
}
throw new Exception("No compatible ripper found");
Expand All @@ -245,7 +249,9 @@ private static List<Constructor<?>> getRipperConstructors() throws Exception {
URL classURL = urls.nextElement();
for (File f : new File(classURL.toURI()).listFiles()) {
String className = f.getName();
if (!className.endsWith(".class") || className.contains("$")) {
if (!className.endsWith(".class")
|| className.contains("$")
|| className.endsWith("Test.class")) {
// Ignore non-class files, nested classes, and test classes.
continue;
}
Expand Down
25 changes: 19 additions & 6 deletions src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ public class ImgurRipper extends AbstractRipper {
private static final String DOMAIN = "imgur.com",
HOST = "imgur";
private static final Logger logger = Logger.getLogger(ImgurRipper.class);

private final int SLEEP_BETWEEN_ALBUMS;

static enum ALBUM_TYPE {
ALBUM,
USER,
Expand Down Expand Up @@ -61,6 +61,8 @@ public URL sanitizeURL(URL url) throws MalformedURLException {
if (u.indexOf('#') >= 0) {
u = u.substring(0, u.indexOf('#'));
}
u = u.replace("https?://m\\.imgur\\.com", "http://imgur.com");
u = u.replace("https?://i\\.imgur\\.com", "http://imgur.com");
return new URL(u);
}

Expand Down Expand Up @@ -204,14 +206,18 @@ public String getGID(URL url) throws MalformedURLException {
this.url = new URL("http://imgur.com/a/" + gid);
return gid;
}
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.imgur\\.com/?$");
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/?$");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
// Root imgur account
String gid = m.group(1);
if (gid.equals("i")) {
throw new MalformedURLException("Ripping i.imgur.com links not supported");
}
albumType = ALBUM_TYPE.USER;
return m.group(1);
return gid;
}
p = Pattern.compile("^https?://([a-zA-Z0-9\\-])\\.imgur\\.com/([a-zA-Z0-9])?$");
p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/([a-zA-Z0-9])?$");
m = p.matcher(url.toExternalForm());
if (m.matches()) {
// Imgur account album
Expand All @@ -223,9 +229,16 @@ public String getGID(URL url) throws MalformedURLException {
if (m.matches()) {
// Series of imgur images
albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
return m.group(m.groupCount()).replaceAll(",", "-");
String gid = m.group(m.groupCount());
if (!gid.contains(",")) {
throw new MalformedURLException("Imgur image doesn't contain commas");
}
return gid.replaceAll(",", "-");
}
throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
}

/**
 * Returns the album type held in this ripper's {@code albumType} field.
 *
 * @return the current value of {@code albumType}
 */
public ALBUM_TYPE getAlbumType() {
    return this.albumType;
}
}
140 changes: 140 additions & 0 deletions src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package com.rarchives.ripme.ripper.rippers;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.rarchives.ripme.ripper.AbstractRipper;

/**
 * Ripper for Instagram user accounts.
 *
 * Instagram URLs are rewritten to the matching statigr.am user page; the
 * media list is then paged through statigr.am's JSON controller and every
 * image/video URL found is queued for download.
 */
public class InstagramRipper extends AbstractRipper {

    private static final String DOMAIN = "instagram.com",
                                HOST   = "instagram";
    private static final Logger logger = Logger.getLogger(InstagramRipper.class);

    // Patterns are static so they are compiled once and are already
    // initialised if sanitizeURL()/getGID() run during super-construction.

    /** Characters accepted in a username: letters, digits, underscore, period. */
    private static final String USERNAME = "[a-zA-Z0-9_.]{3,}";
    /** Link to a single photo page (instagram.com/p/ID). */
    private static final Pattern PHOTO_URL =
            Pattern.compile("^https?://(?:www\\.)?instagram\\.com/p/([a-zA-Z0-9]{1,}).*$");
    /** Link to a user page (instagram.com/username). */
    private static final Pattern USER_URL =
            Pattern.compile("^.*instagram\\.com/(" + USERNAME + ").*$");
    /** Sanitized statigr.am user page (statigr.am/username). */
    private static final Pattern STATIGRAM_URL =
            Pattern.compile("^https?://statigr\\.am/(" + USERNAME + ").*$");

    public InstagramRipper(URL url) throws IOException {
        super(url);
    }

    /**
     * @return true when the URL's host is instagram.com or one of its
     *         subdomains (a host such as "notinstagram.com" is rejected).
     */
    @Override
    public boolean canRip(URL url) {
        String host = url.getHost();
        return host.equals(DOMAIN) || host.endsWith("." + DOMAIN);
    }

    /**
     * Converts an Instagram user or photo URL into the statigr.am user page.
     *
     * @param url Instagram URL pointing at a user page or a single photo.
     * @return URL of the form http://statigr.am/username
     * @throws MalformedURLException if no username can be determined.
     */
    @Override
    public URL sanitizeURL(URL url) throws MalformedURLException {
        String external = url.toExternalForm();
        if (PHOTO_URL.matcher(external).matches()) {
            // Link to photo, not the user account. The helper already returns
            // the statigr.am page, so it must be returned directly: feeding it
            // to the instagram.com user pattern below can never match.
            try {
                return getUserPageFromImage(url);
            } catch (Exception e) {
                logger.error("[!] Failed to get user page from " + url, e);
                throw new MalformedURLException("Failed to retrieve user page from " + url);
            }
        }
        Matcher m = USER_URL.matcher(external);
        if (!m.matches()) {
            throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
        }
        return new URL("http://statigr.am/" + m.group(1));
    }

    /**
     * Loads a photo page and derives the owner's statigr.am page from the
     * og:description meta tag ("NAME's photo on Instagram").
     *
     * @throws IOException if the page cannot be loaded, or (as
     *         MalformedURLException) if no matching meta tag is present.
     */
    private URL getUserPageFromImage(URL url) throws IOException {
        Document doc = Jsoup.connect(url.toExternalForm()).get();
        for (Element element : doc.select("meta[property='og:description']")) {
            String content = element.attr("content");
            if (content.endsWith("'s photo on Instagram")) {
                return new URL("http://statigr.am/" + content.substring(0, content.indexOf("'")));
            }
        }
        throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
    }

    /**
     * Reads the user id from the "user_public" input on the given page.
     *
     * @param url statigr.am user page to load.
     * @throws IOException if the page cannot be loaded or has no such input.
     */
    private String getUserID(URL url) throws IOException {
        logger.info(" Retrieving " + url);
        Document doc = Jsoup.connect(url.toExternalForm()).get();
        Element input = doc.select("input[id=user_public]").first();
        if (input == null) {
            throw new IOException("Unable to find userID at " + url);
        }
        return input.attr("value");
    }

    /**
     * Pages through the user's media list and queues every video/image URL.
     * Paging stops on an empty page, when no next id is available, or when
     * the thread is interrupted; queued downloads are then awaited.
     */
    @Override
    public void rip() throws IOException {
        int index = 0;
        String userID = getUserID(this.url);
        String baseURL = "http://statigr.am/controller_nl.php?action=getPhotoUserPublic&user_id=" + userID;
        String params = "";
        while (true) {
            String pageURL = baseURL + params;
            logger.info(" Retrieving " + pageURL);
            String jsonString = Jsoup.connect(pageURL).ignoreContentType(true).execute().body();
            JSONObject json = new JSONObject(jsonString);
            JSONArray datas = json.getJSONArray("data");
            if (datas.length() == 0) {
                break;
            }
            // The id of the last item on this page is the cursor for the next page.
            String nextMaxID = "";
            for (int i = 0; i < datas.length(); i++) {
                JSONObject data = datas.getJSONObject(i);
                if (data.has("id")) {
                    nextMaxID = data.getString("id");
                }
                if (data.has("videos")) {
                    index += 1;
                    String video = data.getJSONObject("videos").getJSONObject("standard_resolution").getString("url");
                    addURLToDownload(new URL(video), String.format("%03d_", index));
                } else if (data.has("images")) {
                    // Images still advance the counter but are saved without the index prefix.
                    index += 1;
                    String image = data.getJSONObject("images").getJSONObject("standard_resolution").getString("url");
                    addURLToDownload(new URL(image));
                }
            }
            JSONObject pagination = json.getJSONObject("pagination");
            if (nextMaxID.equals("")) {
                if (!pagination.has("next_max_id")) {
                    break;
                }
                nextMaxID = pagination.getString("next_max_id");
            }
            params = "&max_id=" + nextMaxID;
            try {
                // Pause between page requests.
                Thread.sleep(3000);
            } catch (InterruptedException e) {
                logger.error("[!] Interrupted while waiting to load next album:", e);
                // Restore the flag so callers can observe the interruption.
                Thread.currentThread().interrupt();
                break;
            }
        }
        waitForThreads();
    }

    @Override
    public String getHost() {
        return HOST;
    }

    /**
     * Extracts the username from a sanitized statigr.am URL.
     *
     * @throws MalformedURLException if the URL is not a statigr.am user page.
     */
    @Override
    public String getGID(URL url) throws MalformedURLException {
        Matcher m = STATIGRAM_URL.matcher(url.toExternalForm());
        if (m.matches()) {
            return m.group(1);
        }
        throw new MalformedURLException("Unable to find user in " + url);
    }

}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.rarchives.ripme;
package com.rarchives.ripme.tst;

import junit.framework.Test;
import junit.framework.TestCase;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package com.rarchives.ripme.tst.ripper.rippers;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import com.rarchives.ripme.ripper.rippers.ImgurRipper;

/**
 * Integration tests for {@link ImgurRipper}: URL acceptance/rejection and
 * live album ripping (the album test requires network access).
 */
public class ImgurRipperTest extends RippersTest {

    /** URLs that are not albums, accounts or image series must be rejected by the constructor. */
    public void testImgurURLFailures() throws IOException {
        List<URL> failURLs = new ArrayList<URL>();
        // Imgur urls that should not work
        failURLs.add(new URL("http://imgur.com"));
        failURLs.add(new URL("http://imgur.com/"));
        failURLs.add(new URL("http://i.imgur.com"));
        failURLs.add(new URL("http://i.imgur.com/"));
        failURLs.add(new URL("http://imgur.com/image"));
        failURLs.add(new URL("http://imgur.com/image.jpg"));
        failURLs.add(new URL("http://i.imgur.com/image.jpg"));
        for (URL url : failURLs) {
            try {
                new ImgurRipper(url);
                // fail() throws an Error, so the catch below does not swallow it.
                fail("Instantiated ripper for URL that should not work: " + url);
            } catch (Exception e) {
                // Expected
            }
        }
    }

    /** Album, series and account URLs must be accepted. */
    public void testImgurURLPasses() throws IOException {
        List<URL> passURLs = new ArrayList<URL>();
        // Imgur URLs that should work
        passURLs.add(new URL("http://imgur.com/a/XPd4F"));
        passURLs.add(new URL("http://imgur.com/a/XPd4F/"));
        passURLs.add(new URL("http://imgur.com/a/WxG6f/all"));
        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
        passURLs.add(new URL("http://imgur.com/YOdjht3,x5VxH9G,5juXjJ2"));
        passURLs.add(new URL("http://markedone911.imgur.com"));
        passURLs.add(new URL("http://markedone911.imgur.com/"));

        for (URL url : passURLs) {
            try {
                ImgurRipper ripper = new ImgurRipper(url);
                assertTrue(ripper.canRip(url));
                deleteDir(ripper.getWorkingDir());
            } catch (Exception e) {
                fail("Failed to instantiate ripper for " + url);
            }
        }
    }

    /** Each album must produce more than one file in the ripper's working directory. */
    public void testImgurAlbums() throws IOException {
        List<URL> contentURLs = new ArrayList<URL>();
        // URLs that should return more than 1 image
        contentURLs.add(new URL("http://imgur.com/a/hqJIu")); // Vertical layout
        contentURLs.add(new URL("http://imgur.com/a/dS9OQ#0")); // Horizontal layout
        contentURLs.add(new URL("http://imgur.com/a/YpsW9#0")); // Grid layout
        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
        for (URL url : contentURLs) {
            try {
                ImgurRipper ripper = new ImgurRipper(url);
                ripper.rip();
                // JUnit assertion rather than the `assert` keyword, which is
                // skipped entirely unless the JVM runs with -ea.
                assertTrue("Expected more than one file for " + url,
                        ripper.getWorkingDir().listFiles().length > 1);
                deleteDir(ripper.getWorkingDir());
            } catch (Exception e) {
                fail("Error while ripping URL " + url + ": " + e.getMessage());
            }
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package com.rarchives.ripme.tst.ripper.rippers;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import com.rarchives.ripme.ripper.rippers.InstagramRipper;


/**
 * Integration test for {@link InstagramRipper}; requires network access.
 */
public class InstagramRipperTest extends RippersTest {

    /** Each account must produce more than one file in the ripper's working directory. */
    public void testInstagramAlbums() throws IOException {
        List<URL> contentURLs = new ArrayList<URL>();
        contentURLs.add(new URL("http://instagram.com/feelgoodincc#"));
        for (URL url : contentURLs) {
            try {
                InstagramRipper ripper = new InstagramRipper(url);
                ripper.rip();
                // JUnit assertion rather than the `assert` keyword, which is
                // skipped entirely unless the JVM runs with -ea.
                assertTrue("Expected more than one file for " + url,
                        ripper.getWorkingDir().listFiles().length > 1);
                deleteDir(ripper.getWorkingDir());
            } catch (Exception e) {
                fail("Error while ripping URL " + url + ": " + e.getMessage());
            }
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package com.rarchives.ripme.tst.ripper.rippers;

import java.io.File;

import junit.framework.TestCase;

/**
 * Common base class for the ripper integration tests.
 */
public class RippersTest extends TestCase {

    /**
     * Cleanup hook called by the ripper tests with a ripper's working directory.
     *
     * Currently a deliberate no-op: nothing is deleted. The recursive delete
     * is kept below, disabled, so it can be switched back on.
     *
     * @param dir directory that would be removed; not touched at present.
     */
    protected void deleteDir(File dir) {
        // Deletion disabled. Previous implementation:
        //
        //   for (File f : dir.listFiles()) {
        //       if (f.isDirectory()) {
        //           deleteDir(f);
        //       }
        //       f.delete();
        //   }
        //   dir.delete();
    }

}

0 comments on commit e2bb412

Please sign in to comment.