fbstream.d (8357B)
/**
 * Fb2RSS is a translator from the HTML structure generated by Facebook to
 * an atom feed.
 *
 * The page is formatted like this:
 * $(UL
 * 	$(LI The relevant data is inside `<code></code>` blocks)
 * 	$(LI Inside these blocks is further HTML-Data, which is commented out.)
 * 	$(LI The posting and metadata is inside a `<div></div>`, which has the date-time attribute set.)
 * 	$(LI The actual text to the post is inside another `<div></div>`, with class="_5pbx userContent")
 * 	$(LI The link to the Post is inside the href of `<a></a>` with class="_5pcq")
 * )
 *
 * Authors: Dominik Schmidt, das1993@hotmail.com
 *
 * License:
 * Copyright (C) 2015 Dominik Schmidt <das1993@hotmail.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
module fbstream;

import std.net.curl;
import std.stdio;
import std.string;
import std.datetime : SysTime, unixTimeToStdTime;
import std.range;
import std.file;
import std.utf;
import drss.rss;
import drss.render;
import kxml.xml;
import std.typecons;
import std.conv;
import std.json;

/**
 * Builds the path of the cookie jar file.
 *
 * Returns: `Fb2RSS_cookiejar.txt` joined onto the writable config path
 * 	reported by the standardpaths package.
 */
string getCookiePath(){
	import std.path;
	import standardpaths;
	string base=writablePath(StandardPath.config);
	return buildPath(base, "Fb2RSS_cookiejar.txt");
}

/**
 * Thrown by FBStream.parse when the fetched page still contains a captcha
 * challenge.
 */
class CaptchaException : Exception{
	this(string msg, string file=__FILE__, size_t line=__LINE__, Throwable next=null){
		super(msg,file,line,next);
	}
	/// Returns: Only the message, without file/line information.
	override string toString(){
		return msg;
	}
}

/**
 * Searches a JSON tree depth-first for an object member named `id`.
 *
 * A direct member of an object wins over members nested deeper inside it.
 * A nested hit whose value is JSON `null` is treated as "not found", and the
 * search continues with the next sibling.
 *
 * Params:
 * 	tree = The JSON value to search in.
 * 	id = The member name to look for.
 * Returns: The value of the first matching member, or `JSONValue.init`
 * 	(type `null_`) if there is none.
 */
JSONValue search(JSONValue tree, string id){
	with(JSONType)
	switch(tree.type){
		case object:
			auto o = tree.object;
			// Single lookup: `in` yields a pointer to the value, or null.
			if(auto hit = id in o){
				return *hit;
			}
			foreach(v; o.byValue){
				auto nv= search(v, id);
				if(nv.type != null_){
					return nv;
				}
			}
			break;
		case array:
			foreach(v; tree.array){
				auto nv= search(v, id);
				if(nv.type != null_){
					return nv;
				}
			}
			break;
		default:
			// Scalars cannot contain members.
			break;
	}
	return JSONValue.init;
}

/**
 * Manages all the relevant tasks of
 * $(UL
 * 	$(LI Fetching)
 * 	$(LI Parsing)
 * 	$(LI Formatting and Outputting)
 * )
 */
class FBStream : DRSS!(Post){
	///Holds the url, where we get the data from. Can either be an URL or a filename.
	private string fetch_url;
	///The plaintext string holding the whole file
	char[] document;

	///The feed headers ("url" and "title"); the values are filled in by parse().
	DRSS_Header[] headers=[Tuple!(string,string)("url",null),Tuple!(string,string)("title",null)];

	/**
	 * The useragent to use for requesting the page with facebook.
	 * Facebook does check this, and if it doesn't know it, it displays an
	 * "Update your Browser"-Message
	 */
	static string userAgent="curl/7.72.0";

	///The RSS-Header to append.
	static string rss_header=`<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`;

	///The source the stream was constructed with (URL or filename).
	immutable string url;

	/**
	 * Params: fetch_url = Fetch the Data from this source
	 */
	this(string fetch_url){
		auto h=HTTP();
		h.url=fetch_url;
		h.setUserAgent(userAgent);
		date_reliability=DateReliable.YES;
		url=fetch_url;

		//h.setCookieJar(getCookiePath());

		super(h);
	}

	/**
	 * Returns whether the page in buf is already unlocked.
	 *
	 * Params:
	 * 	buf = The chararray of the page.
	 * Returns: True if the page is unlocked, false otherwise
	 */
	static bool captchaSolved(in char[] buf){
		import std.algorithm.searching : canFind;
		return !canFind(buf, "tfbimage.php?captcha_challenge_code");
	}

	/**
	 * Parses the document.
	 *
	 * Fills in the "title" and "url" headers and appends one entry for every
	 * post node found below `<div id="recent">`.
	 *
	 * Params:
	 * 	document = The documentstring to parse.
	 * Throws: CaptchaException if the page still contains a captcha challenge.
	 */
	override public void parse(string document){
		XmlNode[] arr;
		XmlNode root;

		root=readDocument(document);

		if(!captchaSolved(document)){
			throw new CaptchaException("Captcha has not been solved yet. "
					~"Please run the ./captcha utility");
		}
		arr=root.parseXPath(`//title`);
		headers[1][1]=arr[0].getCData().idup;
		headers[0][1]=url;

		XmlNode[] nodes=root.parseXPath(`//div[@id="recent"]`);
		assert(nodes.length>0, "No data nodes found!");
		// The post nodes sit three levels below the "recent" container.
		nodes = nodes[0].getChildren()[0].getChildren()[0].getChildren();
		// Posts are appended in reverse document order.
		foreach(node; nodes.retro){
			appendPost(node);
		}
	}

	/**
	 * Gets the information from the data-div and appends it to #posts.
	 *
	 * Nodes that contain no `<div style=...>` are silently skipped.
	 *
	 * Params: match = The data-div node
	 */
	private void appendPost(XmlNode match){
		XmlNode[] styled;
		try{
			styled=match.parseXPath(`//div[@style]`);
		}
		catch(Exception e){
			return;
		}
		// Indexing an empty array raises a RangeError, which is an Error and
		// would not be caught by the handler above. Check the length
		// explicitly so that nodes without user content are really skipped.
		if(styled.length==0){
			return;
		}
		XmlNode usercontent=styled[0];
		SysTime t=getPostTimestamp(match);
		XmlNode[] href=match.parseXPath(`//a`);
		string hrefs;
		if(href.length!=0){
			// The last anchor of the node is taken as the link to the post.
			hrefs=href[$-1].getAttribute("href");
			/*
			import std.regex;
			auto re = ctRegex!"[^?]+";
			auto m = hrefs.matchFirst(re);
			if(m){
				hrefs = m[0];
			}
			*/
		}
		assert(hrefs.length>0);
		addEntry(Post(usercontent,t,hrefs));
	}

	/**
	 * Gets the timestamp of a post.
	 *
	 * Reads the JSON stored in the node's `data-ft` attribute and looks up
	 * its `publish_time` member, which is interpreted as unix time.
	 *
	 * Params: post = The data-div node of the post
	 * Returns: The publishing time of the post
	 * Throws: JSONException if the attribute is not valid JSON or does not
	 * 	contain an integer `publish_time`.
	 */
	private SysTime getPostTimestamp(XmlNode post){
		import std.json;
		auto attr = post.getAttribute("data-ft");
		auto json = parseJSON(attr);
		auto m = search(json, "publish_time");
		return SysTime(unixTimeToStdTime(m.integer));
	}

	/**
	 * Fetches the raw-data, either from File or from URL.
	 *
	 * If `url` names an existing regular file, its content is read into the
	 * buffer; otherwise fetching is delegated to the base class.
	 */
	public override bool fetch(){
		if(exists(url) && isFile(url)){
			buffer=cast(ubyte[])read(url);
			return true;
		}
		else{
			return super.fetch();
		}
	}

	/**
	 * Generates the RSS-file
	 *
	 * Params:
	 * 	f = the file to write the RSS-Document to.
	 */
	void writeRSS(File f){
		import drss.render;
		XmlNode n=generateRSS(this,headers);
		f.writeln(rss_header);
		f.writeln(n);
	}

}

///A single post of the stream.
struct Post{
	///The userdata `<div></div>`
	XmlNode content;
	///The modification date
	SysTime time;
	///The Post-href
	string href;
	///The count of characters, until the title gets cut off.
	static ushort title_cutoff=80;

	/**
	 * Params: n = The root of the subtree to flatten
	 * Returns: The concatenated character data of n and all its descendants,
	 * 	in depth-first order.
	 */
	static string plaintext(XmlNode n){
		Appender!string app = Appender!string();
		plaintext(n, app);
		return app.data;
	}
	/// ditto, but appends to an existing Appender instead of returning a string.
	static void plaintext(XmlNode n, ref Appender!string app){
		app~=n.getCData();
		foreach(c; n.getChildren){
			plaintext(c, app);
		}
	}

	/**
	 * Returns: The title of the posting, i.e. its plain text cut off after
	 * 	title_cutoff code points, with "..." appended when it was cut.
	 * Bugs: The cutoff counts code points, not graphemes, so a combining
	 * 	sequence at the boundary may be split.
	 */
	@property string title(){
		// Selective local import: std.range also exports a `stride`.
		import std.utf : stride;
		string cont=plaintext(content);
		// Advance code point by code point, but never past the end of the
		// string. Text with many multibyte characters can be longer than
		// title_cutoff bytes and still hold fewer than title_cutoff code
		// points.
		size_t idx=0;
		for(size_t seen=0; seen<title_cutoff && idx<cont.length; ++seen){
			idx+=stride(cont,idx);
		}
		if(idx<cont.length){
			cont=cont[0..idx];
			cont~="...";
		}
		return cont;
	}
	///Returns: The link to the post.
	@property string link() const{
		return "https://facebook.com"~href;
	}

	/**
	 * Returns: A unique id to the post
	 * Bugs: It should be something sensible here, not just the link.
	 * Optimally, it should be the same as the facebookfeed read.
	 */
	@property string id() const{
		return link();
	}

	/// Returns: The Atom-valid datestring
	@property string ISOTime() const{
		return time.toISOExtString();
	}

	/// Returns: An UCData-Object describing the content of the post.
	@property UCData getUCContent(){
		UCData uc=new UCData();
		uc.setCData(content.toString());
		return uc;
	}

	/**
	 * Compares the object with b by comparing the dates
	 * Returns: -1 if b is bigger, 1 if b is smaller, 0 if they're equal
	 */
	int opCmp(in ref Post b) const{
		if(time<b.time){
			return -1;
		}
		else if(time>b.time){
			return 1;
		}
		else{
			return 0;
		}
	}

	/**
	 * Generates an Atom-Entry matching the post
	 * Returns: The Entry-Node for inclusion inside the Atom-Feed.
	 */
	XmlNode toXML(){
		XmlNode e=new XmlNode("entry");
		e.addChild(new XmlNode("title").addCData(title));
		e.addChild(new XmlNode("link").setAttribute("href",link));
		e.addChild(new XmlNode("id").addCData(id));
		e.addChild(new XmlNode("published").addCData(ISOTime()));
		e.addChild(new XmlNode("content").setAttribute("type","html").addChild(getUCContent()));
		return e;
	}
	/// Two posts are considered equal when their timestamps are equal.
	bool opEquals(in ref Post b) const{
		return (opCmp(b)==0);
	}
	/// ditto
	bool opEquals(in Post b) const{
		return (opCmp(b)==0);
	}
}