Normalize the classnames with regexp before parsing the tree - Fb2RSS

commit 26bf5e68f8f4cad7a0a964b3d628c0df44e349d5
parent 70fd2b248f7181b480275f153018a5a190816b6e
Author: Dominik Schmidt <das1993@hotmail.com>
Date:   Tue, 24 Apr 2018 12:59:01 +0000

Normalize the classnames with regexp before parsing the tree

This is a hack, since kxml does not support regex matching (or globbing) on attributes,
and Facebook keeps making slight changes to the format of the class names.

Diffstat:
fbstream.d  | 12 ++++++++++--

1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/fbstream.d b/fbstream.d
@@ -136,6 +136,14 @@ class FBStream : DRSS!(Post){
 		auto script_end=ctRegex!"</script>";
 		document=document.replaceAll(script_start, "<!--").replaceAll(script_end, "-->");
 		
+		/*
+		 * Since the exact class names Facebook emits keep varying, we
+		 * normalize them to a common denominator.
+		 */
+		auto userContent_normalize=ctRegex!`class="[^"]*(userContentWrapper|userContent)[^"]*"`;
+		document=document
+			.replaceAll(userContent_normalize, `class="$1"`);
+		
 		//Add important End-Tags
 		document~="</body></HTML>";
 		
@@ -148,7 +156,7 @@ class FBStream : DRSS!(Post){
 		headers[1][1]=arr[0].getCData().idup;
 		headers[0][1]=url;
 		
-		XmlNode[] nodes=root.parseXPath(`//div[@class="_5pcr userContentWrapper"]`);
+		XmlNode[] nodes=root.parseXPath(`//div[@class="userContentWrapper"]`);
 		assert(nodes.length>0, "No data nodes found!");
 		foreach(node; nodes.retro){
 			appendPost(node);
@@ -160,7 +168,7 @@ class FBStream : DRSS!(Post){
 	 * Params: match = The data-div node
 	 */
 	private void appendPost(XmlNode match){
-		XmlNode[] usercontent=match.parseXPath(`//div[@class="_5pbx userContent"]`);
+		XmlNode[] usercontent=match.parseXPath(`//div[@class="userContent"]`);
 		if(usercontent.length==0){
 			return;
 		}

	Fb2RSS A Facebook to RSS conversion tool
	git clone git://xatko.vsos.ethz.ch/Fb2RSS.git
	Log \| Files \| Refs \| Submodules