Fb2RSS

git clone git://xatko.vsos.ethz.ch/Fb2RSS.git
Log | Files | Refs | Submodules | README

fbstream.d (8357B)


      1 /**
      2  * Fb2RSS is a translator from the HTML structure generated by Facebook to
      3  * an atom feed.
      4  * 
      5  * The page is formatted like this:
      6  * $(UL
      7  * $(LI The relevant data is inside `<code></code>` blocks)
      8  * $(LI Inside these blocks is further HTML-Data, which is commented out.)
      9  * $(LI The posting and metadata is inside a `<div></div>`, which has the date-time attribute set.)
     10  * $(LI The actual text to the post is inside another `<div></div>`, with class="_5pbx userContent")
     11  * $(LI The link to the Post is inside the href of `<a></a>` with class="_5pcq")
     12  * )
     13  * 
     14  * Authors: Dominik Schmidt, das1993@hotmail.com
     15  * 
     16  * License: 
     17  * Copyright (C) 2015  Dominik Schmidt <das1993@hotmail.com>
     18  *
     19  * This program is free software: you can redistribute it and/or modify
     20  * it under the terms of the GNU General Public License as published by
     21  * the Free Software Foundation, either version 3 of the License, or
     22  * (at your option) any later version.
     23  *
     24  * This program is distributed in the hope that it will be useful,
     25  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     26  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     27  * GNU General Public License for more details.
     28  *
     29  * You should have received a copy of the GNU General Public License
     30  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
     31  *
     32  */ 
     33 module fbstream;
     34 
     35 import std.net.curl;
     36 import std.stdio;
     37 import std.string;
     38 import std.datetime : SysTime, unixTimeToStdTime;
     39 import std.range;
     40 import std.file;
     41 import std.utf;
     42 import drss.rss;
     43 import drss.render;
     44 import kxml.xml;
     45 import std.typecons;
     46 import std.conv;
     47 import std.json;
     48 
     49 string getCookiePath(){
     50 	import std.path;
     51 	import standardpaths;
     52 	string base=writablePath(StandardPath.config);
     53 	return buildPath(base, "Fb2RSS_cookiejar.txt");
     54 }
     55 
     56 class CaptchaException : Exception{
     57 	this(string msg, string file=__FILE__, size_t line=__LINE__, Throwable next=null){
     58 		super(msg,file,line,next);
     59 	}
     60 	override string toString(){
     61 		return msg;
     62 	}
     63 }
     64 
     65 JSONValue search(JSONValue tree, string id){
     66 	with(JSONType)
     67 	switch(tree.type){
     68 		case object:
     69 			auto o = tree.object;
     70 			if(id in o){
     71 				return o[id];
     72 			}
     73 			foreach(v; o.byValue){
     74 				auto nv= search(v, id);
     75 				if(nv.type != null_){
     76 					return nv;
     77 				}
     78 			}
     79 		break;
     80 		case array:
     81 			foreach(v; tree.array){
     82 				auto nv= search(v, id);
     83 				if(nv.type != null_){
     84 					return nv;
     85 				}
     86 			}
     87 		break;
     88 		default:
     89 			return JSONValue.init;
     90 		break;
     91 	}
     92 	return JSONValue.init;
     93 }
     94 
     95 /**
     96  * Manages all the relevant tasks of 
     97  * $(UL
     98  * $(LI Fetching)
     99  * $(LI Parsing)
    100  * $(LI Formatting and Outputting)
    101  * )
    102  */
    103 class FBStream : DRSS!(Post){
    104 	///Holds the url, where we get the data from. Can either be an URL or a filename.
    105 	private string fetch_url;
    106 	///The plaintext string holding the whole file
    107 	char[] document;
    108 	
    109 	DRSS_Header[] headers=[Tuple!(string,string)("url",null),Tuple!(string,string)("title",null)];
    110 	
    111 	/**
    112 	 * The useragent to use for requesting the page with facebook.
    113 	 * Facebook does check this, and if it doesn't know it, it displays an
    114 	 * "Update your Browser"-Message
    115 	 */
    116 	static string userAgent="curl/7.72.0";
    117 	
    118 	///The RSS-Header to append.
    119 	static string rss_header=`<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`;
    120 	
    121 	immutable string url;
    122 	
    123 	/**
    124 	 *	Params: fetch_url = Fetch the Data from this source
    125 	 */
    126 	this(string fetch_url){
    127 		auto h=HTTP();
    128 		h.url=fetch_url;
    129 		h.setUserAgent(userAgent);
    130 		date_reliability=DateReliable.YES;
    131 		url=fetch_url;
    132 		
    133 		//h.setCookieJar(getCookiePath());
    134 		
    135 		super(h);
    136 	}
    137 	
    138 	/**
    139 	* Returns wether the page in buf is already unlocked.
    140 	* 
    141 	* Params:
    142 	* 	buf =	The chararray of the page.
    143 	* Returns: True if the page is unlocked, false otherwise
    144 	*/
    145 	static bool captchaSolved(in char[] buf){
    146 		import std.algorithm.searching : canFind;
    147 		return !canFind(buf, "tfbimage.php?captcha_challenge_code");
    148 	}
    149 	
    150 	/**
    151 	 * Parses the document.
    152 	 * 
    153 	 * Params:
    154 	 * 	document = The documentstring to parse.
    155 	 */
    156 	override public void parse(string document){
    157 		XmlNode[] arr;
    158 		XmlNode root;
    159 		
    160 		root=readDocument(document);
    161 		
    162 		if(!captchaSolved(document)){
    163 			throw new CaptchaException("Captcha has not been solved yet. "
    164 			~"Please run the ./captcha utility");
    165 		}
    166 		arr=root.parseXPath(`//title`);
    167 		headers[1][1]=arr[0].getCData().idup;
    168 		headers[0][1]=url;
    169 		
    170 		XmlNode[] nodes=root.parseXPath(`//div[@id="recent"]`);
    171 		assert(nodes.length>0, "No data nodes found!");
    172 		nodes = nodes[0].getChildren()[0].getChildren()[0].getChildren();
    173 		foreach(node; nodes.retro){
    174 			appendPost(node);
    175 		}
    176 	}
    177 	
    178 	/**
    179 	 * Gets the information from the data-div and appends it to #posts
    180 	 * Params: match = The data-div node
    181 	 */
    182 	private void appendPost(XmlNode match){
    183 		XmlNode usercontent;
    184 		try{
    185 			usercontent=match.parseXPath(`//div[@style]`)[0];
    186 		}
    187 		catch(Exception e){
    188 			return;
    189 		}
    190 		SysTime t=getPostTimestamp(match);
    191 		XmlNode[] href=match.parseXPath(`//a`);
    192 		string hrefs;
    193 		if(href.length!=0){
    194 			hrefs=href[$-1].getAttribute("href");
    195 			/*
    196 			import std.regex;
    197 			auto re = ctRegex!"[^?]+";
    198 			auto m = hrefs.matchFirst(re);
    199 			if(m){
    200 				hrefs = m[0];
    201 			}
    202 			*/
    203 		}
    204 		assert(hrefs.length>0);
    205 		addEntry(Post(usercontent,t,hrefs));
    206 	}
    207 	
    208 	/**
    209 	 * Gets the timestamp of a post
    210 	 * 
    211 	 */
    212 	 private SysTime getPostTimestamp(XmlNode post){
    213 		import std.json;
    214 		auto attr = post.getAttribute("data-ft");
    215 		auto json = parseJSON(attr);
    216 		auto m = search(json, "publish_time");
    217 		return SysTime(unixTimeToStdTime(m.integer));
    218 	 }
    219 	
    220 	/**
    221 	 * Fetches the raw-data, either from File or from URL
    222 	 */
    223 	public override bool fetch(){
    224 		if(exists(url) && isFile(url)){
    225 			buffer=cast(ubyte[])read(url);
    226 			return true;
    227 		}
    228 		else{
    229 			return super.fetch();
    230 		}
    231 	}
    232 	
    233 	/**
    234 	 * Generates the RSS-file
    235 	 * 
    236 	 * Params:
    237 	 * 	f = the file to write the RSS-Document to.
    238 	 */
    239 	void writeRSS(File f){
    240 		import drss.render;
    241 		XmlNode n=generateRSS(this,headers);
    242 		f.writeln(rss_header);
    243 		f.writeln(n);
    244 	}
    245 	
    246 }
    247 
    248 ///
    249 struct Post{
    250 	///The userdata `<div></div>`
    251 	XmlNode content;
    252 	///The modification date 
    253 	SysTime time;
    254 	///The Post-href
    255 	string href;
    256 	///The count of characters, until the title gets cut off.
    257 	static ushort title_cutoff=80;
    258 	
    259 	static string plaintext(XmlNode n){
    260 		Appender!string app = Appender!string();
    261 		plaintext(n, app);
    262 		return app.data;
    263 	}
    264 	static void plaintext(XmlNode n, ref Appender!string app){
    265 		app~=n.getCData();
    266 		foreach(c; n.getChildren){
    267 			plaintext(c, app);
    268 		}
    269 	}
    270 	
    271 	/**
    272 	 * Return: The title of the posting 
    273 	 * Bugs: title_cutoff is reached with fewer characters when there are 
    274 	 * 	a lot of multibyte characters in the string.
    275 	 */
    276 	@property string title(){
    277 		string cont=plaintext(content);
    278 		if(cont.length>title_cutoff){
    279 			cont=cont[0..toUTFindex(cont,title_cutoff)];
    280 			cont~="...";
    281 		}
    282 		return cont;
    283 	}
    284 	///Returns: The link to the post.
    285 	@property string link() const{
    286 		return "https://facebook.com"~href;
    287 	}
    288 	
    289 	/**
    290 	 * Returns: An unique id to the post
    291 	 * Bugs: It should be something sensible here, not just the link.
    292 	 * 		Optimally, it should be the same as the facebookfeed read.
    293 	 */
    294 	@property string id() const{
    295 		return link();
    296 	}
    297 	
    298 	/// Returns: The Atom-valid datestring
    299 	@property string ISOTime() const{
    300 		return time.toISOExtString();
    301 	}
    302 	
    303 	/// Returns: An UCData-Object describing the content of the post.
    304 	@property UCData getUCContent(){
    305 		UCData uc=new UCData();
    306 		uc.setCData(content.toString());
    307 		return uc;
    308 	}
    309 
    310 	/**
    311 	 * Compares the object with b by comparing the dates
    312 	 * Returns: -1 if b is bigger, 1 if b is smaller, 0 if they're equal
    313 	 */
    314 	int opCmp(in ref Post b) const{
    315 		if(time<b.time){
    316 			return -1;
    317 		}
    318 		else if(time>b.time){
    319 			return 1;
    320 		}
    321 		else{
    322 			return 0;
    323 		}
    324 	}
    325 	
    326 	/**
    327 	 * Generates an Atom-Entry matching the post
    328 	 * Returns: The Entry-Node for inclusion inside the Atom-Feed.
    329 	 */
    330 	XmlNode toXML(){
    331 		XmlNode e=new XmlNode("entry");
    332 		e.addChild(new XmlNode("title").addCData(title));
    333 		e.addChild(new XmlNode("link").setAttribute("href",link));
    334 		e.addChild(new XmlNode("id").addCData(id));
    335 		e.addChild(new XmlNode("published").addCData(ISOTime()));
    336 		e.addChild(new XmlNode("content").setAttribute("type","html").addChild(getUCContent()));
    337 		return e;
    338 	}
    339 	///
    340 	bool opEquals(in ref Post b) const{
    341 		return (opCmp(b)==0);
    342 	}
    343 	///
    344 	bool opEquals(in Post b) const{
    345 		return (opCmp(b)==0);
    346 	}
    347 }