Fb2RSS

A Facebook to RSS conversion tool
git clone git://xatko.vsos.ethz.ch/Fb2RSS.git
Log | Files | Refs | Submodules | README

commit 78976dd5f0b050c15d58166d1a05be7356676e82
parent f59d0cc200111fce54a690f04bfe2dfd0c44073b
Author: Dominik Schmidt <dominik@schm1dt.ch>
Date:   Sun, 13 Sep 2020 15:59:48 +0200

Adapt to upstream changes

Diffstat:
M Fb2RSS.d | 14 +++++++++++++-
M Makefile | 2 +-
M captcha.d | 4 ++++
M fbstream.d | 115 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
4 files changed, 90 insertions(+), 45 deletions(-)

diff --git a/Fb2RSS.d b/Fb2RSS.d @@ -1,8 +1,20 @@ import std.stdio; import fbstream; +import std.algorithm: endsWith; +import std.regex; +import std.format; void main(string[] args){ - FBStream str=new FBStream(args[1]); + string url=args[1]; + auto reg = ctRegex!`([^:]+)://([^/]+)/(.*)`; + + auto m = url.matchFirst(reg); + if(!m){ + throw new Exception("Not an url"); + } + url = format("https://m.facebook.com/%s",m[3]); + + FBStream str=new FBStream(url); str.update(); str.writeRSS(stdout); } diff --git a/Makefile b/Makefile @@ -1,5 +1,5 @@ DMD?=dmd -OPTS?=-release -O +OPTS?=-g IOPTS=$(OPTS) -IDRSS/ -IDRSS/kxml/source/ -Istandardpaths/source/ all: Fb2RSS captcha diff --git a/captcha.d b/captcha.d @@ -6,6 +6,7 @@ import std.path; import std.regex; import std.format; import std.range; +import std.process; import std.algorithm.searching; @@ -60,8 +61,11 @@ int main(string[] args){ f.close(); writeln("The captcha has been written to "~file); writeln("Please enter the text below:"); + auto pid = spawnProcess(["/usr/bin/pqiv", "-i", file]); + char[] captcha; readln(captcha); + kill(pid); captcha=captcha[0..$-1]; //Exclude '\n' buf=null; diff --git a/fbstream.d b/fbstream.d @@ -43,8 +43,9 @@ import drss.rss; import drss.render; import kxml.xml; import std.typecons; +import std.conv; +import std.json; - string getCookiePath(){ import std.path; import standardpaths; @@ -61,6 +62,36 @@ class CaptchaException : Exception{ } } +JSONValue search(JSONValue tree, string id){ + with(JSONType) + switch(tree.type){ + case object: + auto o = tree.object; + if(id in o){ + return o[id]; + } + foreach(v; o.byValue){ + auto nv= search(v, id); + if(nv.type != null_){ + return nv; + } + } + break; + case array: + foreach(v; tree.array){ + auto nv= search(v, id); + if(nv.type != null_){ + return nv; + } + } + break; + default: + return JSONValue.init; + break; + } + return JSONValue.init; +} + /** * Manages all the relevant tasks of * $(UL @@ -82,7 +113,7 @@ class FBStream : DRSS!(Post){ * 
Facebook does check this, and if it doesn't know it, it displays an * "Update your Browser"-Message */ - static string userAgent="Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.04"; + static string userAgent="curl/7.72.0"; ///The RSS-Header to append. static string rss_header=`<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`; @@ -99,7 +130,7 @@ class FBStream : DRSS!(Post){ date_reliability=DateReliable.YES; url=fetch_url; - h.setCookieJar(getCookiePath()); + //h.setCookieJar(getCookiePath()); super(h); } @@ -126,28 +157,8 @@ class FBStream : DRSS!(Post){ XmlNode[] arr; XmlNode root; - //Make the HTML valid for the parser - import std.regex; - /* - * Scripts aren't properly commented, so just replace them with comments - * We don't need them anyways - */ - auto script_start=ctRegex!"<script[^>]*>"; - auto script_end=ctRegex!"</script>"; - document=document.replaceAll(script_start, "<!--").replaceAll(script_end, "-->"); - - /* - * Now, since the exact class names of facebook always vary, we - * normalize them to a common denominatory - */ - auto userContent_normalize=ctRegex!`class="[^"]*(userContentWrapper|userContent)[^"]*"`; - document=document - .replaceAll(userContent_normalize, `class="$1"`); - - //Add important End-Tags - document~="</body></HTML>"; - root=readDocument(document); + if(!captchaSolved(document)){ throw new CaptchaException("Captcha has not been solved yet. 
" ~"Please run the ./captcha utility"); @@ -156,8 +167,9 @@ class FBStream : DRSS!(Post){ headers[1][1]=arr[0].getCData().idup; headers[0][1]=url; - XmlNode[] nodes=root.parseXPath(`//div[@class="userContentWrapper"]`); + XmlNode[] nodes=root.parseXPath(`//div[@id="recent"]`); assert(nodes.length>0, "No data nodes found!"); + nodes = nodes[0].getChildren()[0].getChildren()[0].getChildren(); foreach(node; nodes.retro){ appendPost(node); } @@ -168,21 +180,29 @@ class FBStream : DRSS!(Post){ * Params: match = The data-div node */ private void appendPost(XmlNode match){ - XmlNode[] usercontent=match.parseXPath(`//div[@class="userContent"]`); - if(usercontent.length==0){ - return; + XmlNode usercontent; + try{ + usercontent=match.parseXPath(`//div[@style]`)[0]; } - XmlNode[] translatediv=usercontent[0].parseXPath(`/div[@class="_43f9"]`); - if(translatediv.length>0){ - usercontent[0].removeChild(translatediv[0]); + catch(Exception e){ + return; } SysTime t=getPostTimestamp(match); - XmlNode[] href=match.parseXPath(`//a[@class="_5pcq"]`); + XmlNode[] href=match.parseXPath(`//a`); string hrefs; if(href.length!=0){ - hrefs=href[0].getAttribute("href"); + hrefs=href[$-1].getAttribute("href"); + /* + import std.regex; + auto re = ctRegex!"[^?]+"; + auto m = hrefs.matchFirst(re); + if(m){ + hrefs = m[0]; + } + */ } - addEntry(Post(usercontent[0],t,hrefs)); + assert(hrefs.length>0); + addEntry(Post(usercontent,t,hrefs)); } /** @@ -190,10 +210,11 @@ class FBStream : DRSS!(Post){ * */ private SysTime getPostTimestamp(XmlNode post){ - XmlNode[] matches=post.parseXPath(`//abbr[@data-utime]`); - assert(matches.length>0, "No date-utime node found in post"); - string time=matches[0].getAttribute("data-utime"); - return SysTime(unixTimeToStdTime(to!ulong(time))); + import std.json; + auto attr = post.getAttribute("data-ft"); + auto json = parseJSON(attr); + auto m = search(json, "publish_time"); + return SysTime(unixTimeToStdTime(m.integer)); } /** @@ -235,17 +256,25 @@ struct Post{ 
///The count of characters, until the title gets cut off. static ushort title_cutoff=80; + static string plaintext(XmlNode n){ + Appender!string app = Appender!string(); + plaintext(n, app); + return app.data; + } + static void plaintext(XmlNode n, ref Appender!string app){ + app~=n.getCData(); + foreach(c; n.getChildren){ + plaintext(c, app); + } + } + /** * Return: The title of the posting * Bugs: title_cutoff is reached with fewer characters when there are * a lot of multibyte characters in the string. */ @property string title(){ - auto children=content.getChildren(); - if(children.length==0){ - return ""; - } - string cont=children[0].getCData(); + string cont=plaintext(content); if(cont.length>title_cutoff){ cont=cont[0..toUTFindex(cont,title_cutoff)]; cont~="...";