Add captcha solving support. - Fb2RSS - A Facebook to RSS conversion tool

commit 9804aec4177eb906990bcf69b0a0e05204a48356
parent 03f9bf88d25ca86215fa40ec222d76b76f11c3ca
Author: Dominik Schmidt <das1993@hotmail.com>
Date:   Sat, 23 Jan 2016 00:02:32 +0100

Add captcha solving support.

Facebook thought it would be a nifty "security" feature to add a
"security check" captcha to their pages.

Whilst this has nothing to do with "security", it effectively renders
my attempts at fetching the data useless.

So here's what to do:
1) Add cookie storage to the request
2) Solve a captcha once
3) ???
4) Non-Profit

Which is what was implemented in this commit.
You'll have to run the ./captcha binary once (passing the page URL as its argument)
and type in the text shown in the captcha image; after that, fetching should work normally again.

For compatibility reasons, I included the standardpaths library, which handles
paths quite nicely.

Diffstat:
.gitmodules  | 3 +++
Makefile  | 8 ++++++--
captcha.d  | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
fbstream.d  | 45 ++++++++++++++++++++++++++++++++++++++++++++-
standardpaths  | 1 +

5 files changed, 141 insertions(+), 3 deletions(-)
diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "DRSS"]
 	path = DRSS
 	url = https://github.com/Doeme/DRSS.git
+[submodule "standardpaths"]
+	path = standardpaths
+	url = https://github.com/MyLittleRobo/standardpaths.git
diff --git a/Makefile b/Makefile
@@ -1,9 +1,13 @@
 DMD?=ldmd2
 OPTS?=-release -O
-IOPTS=$(OPTS) -IDRSS/ -IDRSS/kxml/source/
+IOPTS=$(OPTS) -IDRSS/ -IDRSS/kxml/source/ -Istandardpaths/source/
 
-Fb2RSS: fbstream.o Fb2RSS.o DRSS/drss.a
+Fb2RSS: fbstream.o Fb2RSS.o DRSS/drss.a standardpaths/libstandardpaths.a
 	$(DMD) $(IOPTS) $^ -of$@
+captcha: captcha.o fbstream.o DRSS/drss.a standardpaths/libstandardpaths.a
+	$(DMD) $(IOPTS) $^ -of$@
+standardpaths/libstandardpaths.a: standardpaths/source/standardpaths.o
+	$(DMD) $(IOPTS) -lib $^ -of$@
 %.o: %.d
 	$(DMD) $(IOPTS) -c $< -of$@
 .PHONY:
diff --git a/captcha.d b/captcha.d
@@ -0,0 +1,87 @@
+import fbstream;
+import std.net.curl;
+import std.stdio;
+import std.file;
+import std.path;
+import std.regex;
+import std.format;
+import std.range;
+import std.algorithm.searching;
+
+
+/**
+ * Tries to fetch the captcha and set the cookies
+ * 
+ * Returns: 0 if the captcha is solved, 1 otherwise.
+ */
+int main(string[] args){
+	auto h=HTTP();
+	char[] buf;
+	
+	h.url=args[1];
+	h.setUserAgent(FBStream.userAgent);
+	h.setCookieJar(getCookiePath());
+	h.onReceive = (ubyte[] data){
+		buf~=cast(char[])data;
+		return data.length;
+	};
+	h.perform();
+	
+	if(FBStream.captchaSolved(buf)){
+		writeln("Captcha already solved :)");
+		return 0;
+	}
+	
+	auto url_regex=ctRegex!(".*(https://www.facebook.com/captcha/tfbimage.php[^\"]+).*");
+	auto url=matchFirst(buf, url_regex)[1];
+	auto datr_regex=ctRegex!(".*\"_js_datr\",\"([^\"]+)\".*");
+	auto datr=matchFirst(buf, datr_regex);
+	
+	h.setCookie("_js_datr="~datr[1]);
+	
+	
+	auto captcha_regex=ctRegex!(".*name=\"captcha_persist_data\" value=\"([^\"]+)\".*");
+	auto captcha_hash=matchFirst(buf, captcha_regex)[1];
+	
+	buf=null;
+	h.url=url;
+	h.perform();
+	
+	File f;
+	string file=buildPath(tempDir(),"fb2rss_captcha.png");
+	f.open(file, "w+");
+	scope(exit){
+		f.close();
+		remove(file);
+	}
+	f.write(buf);
+	f.close();
+	writeln("The captcha has been written to "~file);
+	writeln("Please enter the text below:");
+	char[] captcha;
+	readln(captcha);
+	captcha=captcha[0..$-1]; //Exclude '\n'
+	
+	buf=null;
+	h.url=args[1];
+	h.method=HTTP.Method.post;
+	h.setPostData(
+		format(
+			"captcha_persist_data=%s&captcha_response=%s&captcha_submit=1",
+			captcha_hash,
+			captcha
+		),
+		"application/x-www-form-urlencoded"
+	);
+	h.perform();
+	
+	if(FBStream.captchaSolved(buf)){
+		writeln("Success");
+	}
+	else{
+		writeln("Sorry, didn't work :C");
+		writeln("Please, try again!");
+		return 1;
+	}
+	return 0;
+}
diff --git a/fbstream.d b/fbstream.d
@@ -44,6 +44,23 @@ import drss.render;
 import kxml.xml;
 import std.typecons;
 
+	
+string getCookiePath(){
+	import std.path;
+	import standardpaths;
+	string base=writablePath(StandardPath.config);
+	return buildPath(base, "Fb2RSS_cookiejar.txt");
+}
+
+class CaptchaException : Exception{
+	this(string msg, string file=__FILE__, size_t line=__LINE__, Throwable next=null){
+		super(msg,file,line,next);
+	}
+	override string toString(){
+		return msg;
+	}
+}
+
 /**
  * Manages all the relevant tasks of 
  * $(UL
@@ -82,10 +99,24 @@ class FBStream : DRSS!(Post){
 		date_reliability=DateReliable.YES;
 		url=fetch_url;
 		
+		h.setCookieJar(getCookiePath());
+		
 		super(h);
 	}
 	
 	/**
+	* Returns whether the page in buf is already unlocked.
+	* 
+	* Params:
+	* 	buf =	The chararray of the page.
+	* Returns: True if the page is unlocked, false otherwise
+	*/
+	static bool captchaSolved(in char[] buf){
+		import std.algorithm.searching : canFind;
+		return !canFind(buf, "captcha");
+	}
+	
+	/**
 	 * Parses the document.
 	 * 
 	 * Params:
@@ -93,7 +124,19 @@ class FBStream : DRSS!(Post){
 	 */
 	override public void parse(string document){
 		XmlNode[] arr;
-		XmlNode root=readDocument(document);
+		XmlNode root;
+		try{
+			root=readDocument(document);
+		}
+		catch(Exception e){
+			if(!captchaSolved(document)){
+				throw new CaptchaException("Captcha has not been solved yet. "
+				"Please run the ./captcha utility");
+			}
+			else{
+				throw e;
+			}
+		}
 		arr=root.parseXPath(`//title`);
 		headers[1][1]=arr[0].getCData().idup;
 		headers[0][1]=url;
diff --git a/standardpaths b/standardpaths
@@ -0,0 +1 @@
+Subproject commit 4bc270dfdca83e5e6fe8d0558efd40cca28b57ca

	Fb2RSS A Facebook to RSS conversion tool
	git clone git://xatko.vsos.ethz.ch/Fb2RSS.git
	Log \| Files \| Refs \| Submodules

.gitmodules	\|	3	+++
Makefile	\|	8	++++++--
captcha.d	\|	87	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
fbstream.d	\|	45	++++++++++++++++++++++++++++++++++++++++++++-
standardpaths	\|	1	+