Remove htmlcxx dependency and use libtidy with tinyxml2 to parse html

This commit is contained in:
Sude 2024-05-09 14:37:53 +03:00
parent 1866f4c65c
commit 1c0ab298e4
10 changed files with 130 additions and 155 deletions

View File

@ -32,7 +32,7 @@ jobs:
run: |
sudo apt -y update
sudo apt -y install ninja-build build-essential libcurl4-openssl-dev libboost-regex-dev \
libjsoncpp-dev librhash-dev libtinyxml2-dev libhtmlcxx-dev \
libjsoncpp-dev librhash-dev libtinyxml2-dev libtidy-dev \
libboost-system-dev libboost-filesystem-dev libboost-program-options-dev \
libboost-date-time-dev libboost-iostreams-dev help2man cmake \
pkg-config zlib1g-dev qtwebengine5-dev

View File

@ -25,11 +25,11 @@ find_package(Boost
)
find_package(CURL 7.55.0 REQUIRED)
find_package(Jsoncpp REQUIRED)
find_package(Htmlcxx REQUIRED)
find_package(Tinyxml2 REQUIRED)
find_package(Rhash REQUIRED)
find_package(Threads REQUIRED)
find_package(ZLIB REQUIRED)
find_package(Tidy REQUIRED)
file(GLOB SRC_FILES
main.cpp
@ -111,10 +111,10 @@ target_include_directories(${PROJECT_NAME}
PRIVATE ${CURL_INCLUDE_DIRS}
PRIVATE ${OAuth_INCLUDE_DIRS}
PRIVATE ${Jsoncpp_INCLUDE_DIRS}
PRIVATE ${Htmlcxx_INCLUDE_DIRS}
PRIVATE ${Tinyxml2_INCLUDE_DIRS}
PRIVATE ${Rhash_INCLUDE_DIRS}
PRIVATE ${ZLIB_INCLUDE_DIRS}
PRIVATE ${Tidy_INCLUDE_DIRS}
)
target_link_libraries(${PROJECT_NAME}
@ -122,11 +122,11 @@ target_link_libraries(${PROJECT_NAME}
PRIVATE ${CURL_LIBRARIES}
PRIVATE ${OAuth_LIBRARIES}
PRIVATE ${Jsoncpp_LIBRARIES}
PRIVATE ${Htmlcxx_LIBRARIES}
PRIVATE ${Tinyxml2_LIBRARIES}
PRIVATE ${Rhash_LIBRARIES}
PRIVATE ${CMAKE_THREAD_LIBS_INIT}
PRIVATE ${ZLIB_LIBRARIES}
PRIVATE ${Tidy_LIBRARIES}
)
# Check if libatomic is needed in order to use std::atomic, and add

View File

@ -8,7 +8,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen
* [libcurl](https://curl.haxx.se/libcurl/) >= 7.55.0
* [librhash](https://github.com/rhash/RHash)
* [jsoncpp](https://github.com/open-source-parsers/jsoncpp)
* [htmlcxx](http://htmlcxx.sourceforge.net/)
* [libtidy](https://www.html-tidy.org/)
* [tinyxml2](https://github.com/leethomason/tinyxml2)
* [boost](http://www.boost.org/) (regex, date-time, system, filesystem, program-options, iostreams)
* [zlib](https://www.zlib.net/)
@ -22,7 +22,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen
## Debian/Ubuntu
# apt install build-essential libcurl4-openssl-dev libboost-regex-dev \
libjsoncpp-dev librhash-dev libtinyxml2-dev libhtmlcxx-dev \
libjsoncpp-dev librhash-dev libtinyxml2-dev libtidy-dev \
libboost-system-dev libboost-filesystem-dev libboost-program-options-dev \
libboost-date-time-dev libboost-iostreams-dev help2man cmake \
pkg-config zlib1g-dev qtwebengine5-dev ninja-build
@ -35,7 +35,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen
## Fedora
```
sudo dnf install cmake make gcc gcc-c++ glibc tinyxml2-devel rhash-devel \
htmlcxx-devel tinyxml-devel jsoncpp-devel liboauth-devel libcurl-devel \
libtidy-devel tinyxml-devel jsoncpp-devel liboauth-devel libcurl-devel \
boost-devel help2man
```
### Build and Install

View File

@ -1,54 +0,0 @@
# - Try to find htmlcxx
#
# Once done this will define
# Htmlcxx_FOUND - System has htmlcxx
# Htmlcxx_INCLUDE_DIRS - The htmlcxx include directories
# Htmlcxx_LIBRARIES - The libraries needed to use htmlcxx
#
# Note: Htmlcxx_FOUND mirrors the pkg-config result (PC_HTMLCXX_FOUND); it is
# not derived from the find_path()/find_library() results below.

# Query pkg-config for htmlcxx. REQUIRED makes pkg_check_modules() itself
# abort the configure step when the module is not known to pkg-config.
find_package(PkgConfig)
pkg_check_modules(PC_HTMLCXX REQUIRED htmlcxx)
# Locate the include directory by looking for the CSS parser and HTML tree
# headers, also trying a "htmlcxx" sub-directory of each candidate path.
find_path(HTMLCXX_INCLUDE_DIR
NAMES
css/parser.h
html/tree.h
HINTS
${PC_HTMLCXX_INCLUDEDIR}
${PC_HTMLCXX_INCLUDE_DIRS}
PATH_SUFFIXES
htmlcxx
PATHS
${PC_HTMLCXX_INCLUDE_DIRS}
)
# Main htmlcxx library
find_library(HTMLCXX_LIBRARY_HTMLCXX htmlcxx
HINTS
${PC_HTMLCXX_LIBDIR}
${PC_HTMLCXX_LIBRARY_DIRS}
PATHS
${PC_HTMLCXX_LIBRARY_DIRS}
)
# css_parser library
find_library(HTMLCXX_LIBRARY_CSS_PARSER css_parser
HINTS
${PC_HTMLCXX_LIBDIR}
${PC_HTMLCXX_LIBRARY_DIRS}
PATHS
${PC_HTMLCXX_LIBRARY_DIRS}
)
# css_parser_pp library
find_library(HTMLCXX_LIBRARY_CSS_PARSER_PP css_parser_pp
HINTS
${PC_HTMLCXX_LIBDIR}
${PC_HTMLCXX_LIBRARY_DIRS}
PATHS
${PC_HTMLCXX_LIBRARY_DIRS}
)
# Hide the internal cache entries from the default cmake-gui/ccmake view
mark_as_advanced(HTMLCXX_INCLUDE_DIR HTMLCXX_LIBRARY_HTMLCXX HTMLCXX_LIBRARY_CSS_PARSER HTMLCXX_LIBRARY_CSS_PARSER_PP)
# Publish the documented result variables; all three libraries are exported
# together in Htmlcxx_LIBRARIES.
if(PC_HTMLCXX_FOUND)
set(Htmlcxx_FOUND ON)
set(Htmlcxx_INCLUDE_DIRS ${HTMLCXX_INCLUDE_DIR})
set(Htmlcxx_LIBRARIES ${HTMLCXX_LIBRARY_HTMLCXX} ${HTMLCXX_LIBRARY_CSS_PARSER} ${HTMLCXX_LIBRARY_CSS_PARSER_PP})
endif(PC_HTMLCXX_FOUND)

33
cmake/FindTidy.cmake Normal file
View File

@ -0,0 +1,33 @@
# - Try to find tidy
#
# Once done this will define
#  Tidy_FOUND - System has tidy (both the header and the library were located)
#  Tidy_INCLUDE_DIRS - The tidy include directories
#  Tidy_LIBRARIES - The libraries needed to use tidy

# pkg-config is only used as a source of search hints; it is not mandatory.
find_package(PkgConfig)
pkg_check_modules(PC_TIDY tidy)

# Some installations place tidy.h in a "tidy" sub-directory of the include
# path, so that suffix is tried in addition to the plain directories.
find_path(TIDY_INCLUDE_DIR tidy.h
  HINTS
    ${PC_TIDY_INCLUDEDIR}
    ${PC_TIDY_INCLUDE_DIRS}
  PATH_SUFFIXES
    tidy
  PATHS
    ${PC_TIDY_INCLUDE_DIRS}
  )

find_library(TIDY_LIBRARY tidy
  HINTS
    ${PC_TIDY_LIBDIR}
    ${PC_TIDY_LIBRARY_DIRS}
  PATHS
    ${PC_TIDY_LIBRARY_DIRS}
  )

mark_as_advanced(TIDY_INCLUDE_DIR TIDY_LIBRARY)

# Sets Tidy_FOUND only when BOTH the library and the header were found, and
# makes find_package(Tidy REQUIRED) stop with a clear error at configure time
# instead of failing later during compilation or linking.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Tidy
  REQUIRED_VARS TIDY_LIBRARY TIDY_INCLUDE_DIR
  )

if(Tidy_FOUND)
  set(Tidy_INCLUDE_DIRS ${TIDY_INCLUDE_DIR})
  set(Tidy_LIBRARIES ${TIDY_LIBRARY})
endif(Tidy_FOUND)

View File

@ -24,6 +24,7 @@
#include <json/json.h>
#include <boost/date_time/posix_time/posix_time.hpp>
#include <curl/curl.h>
#include <tinyxml2.h>
typedef struct
{
@ -104,6 +105,8 @@ namespace Util
}
Json::Value readJsonFile(const std::string& path);
std::string transformGamename(const std::string& gamename);
std::string htmlToXhtml(const std::string& html);
tinyxml2::XMLNode* nextXMLNode(tinyxml2::XMLNode* node);
}
#endif // UTIL_H

View File

@ -32,7 +32,6 @@ class Website
private:
CURL* curlhandle;
bool IsloggedInSimple();
bool IsLoggedInComplex(const std::string& email);
std::map<std::string, std::string> getTagsFromJson(const Json::Value& json);
int retries;
std::string LoginGetAuthCode(const std::string& email, const std::string& password);

View File

@ -24,8 +24,6 @@
#include <boost/date_time/posix_time/posix_time.hpp>
#include <tinyxml2.h>
#include <json/json.h>
#include <htmlcxx/html/ParserDom.h>
#include <htmlcxx/html/Uri.h>
#include <termios.h>
#include <algorithm>
#include <thread>
@ -1630,30 +1628,18 @@ std::string Downloader::getSerialsFromJSON(const Json::Value& json)
}
else
{
htmlcxx::HTML::ParserDom parser;
tree<htmlcxx::HTML::Node> dom = parser.parseTree(cdkey);
tree<htmlcxx::HTML::Node>::iterator it = dom.begin();
tree<htmlcxx::HTML::Node>::iterator end = dom.end();
for (; it != end; ++it)
std::string xhtml = Util::htmlToXhtml(cdkey);
tinyxml2::XMLDocument doc;
doc.Parse(xhtml.c_str());
tinyxml2::XMLNode* node = doc.FirstChildElement("html");
while(node)
{
std::string tag_text;
if (it->tagName() == "span")
{
for (unsigned int j = 0; j < dom.number_of_children(it); ++j)
{
tree<htmlcxx::HTML::Node>::iterator span_it = dom.child(it, j);
if (!span_it->isTag() && !span_it->isComment())
tag_text = span_it->text();
}
}
tinyxml2::XMLElement *element = node->ToElement();
const char* text = element->GetText();
if (text)
serials << text << std::endl;
if (!tag_text.empty())
{
boost::regex expression("^\\h+|\\h+$");
std::string text = boost::regex_replace(tag_text, expression, "");
if (!text.empty())
serials << text << std::endl;
}
node = Util::nextXMLNode(node);
}
}

View File

@ -11,10 +11,11 @@
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp>
#include <tinyxml2.h>
#include <json/json.h>
#include <fstream>
#include <sys/ioctl.h>
#include <tidy.h>
#include <tidybuffio.h>
/*
Create filepath from specified directory and path
@ -1007,3 +1008,49 @@ std::string Util::transformGamename(const std::string& gamename)
return gamename_transformed;
}
std::string Util::htmlToXhtml(const std::string& html)
{
std::string xhtml;
TidyBuffer buffer = {0, 0, 0, 0, 0};
int rc = -1;
TidyDoc doc = tidyCreate();
tidyOptSetBool(doc, TidyXhtmlOut, yes);
tidyOptSetBool(doc, TidyForceOutput, yes);
tidyOptSetInt(doc, TidyWrapLen, 0);
tidyOptSetInt(doc, TidyShowInfo, 0);
tidyOptSetInt(doc, TidyShowWarnings, 0);
rc = tidyParseString(doc, html.c_str());
if ( rc >= 0 )
rc = tidyCleanAndRepair(doc);
if ( rc >= 0 )
rc = tidySaveBuffer(doc, &buffer);
xhtml = std::string((char*)buffer.bp, buffer.size);
tidyBufFree(&buffer);
tidyRelease(doc);
return xhtml;
}
/*
    Get the element that follows "node" in a depth-first (document order) walk
    Order of preference: first child element, next sibling element,
    next sibling element of the closest ancestor that has one
    returns the next element
    returns nullptr if node is nullptr or there are no more elements
*/
tinyxml2::XMLNode* Util::nextXMLNode(tinyxml2::XMLNode* node)
{
if (!node)
return nullptr;

// Has child element, go to first child
if (tinyxml2::XMLElement* child = node->FirstChildElement())
return child;

// No children: take the next sibling of this node or, failing that,
// of the nearest ancestor that has one
for (tinyxml2::XMLNode* current = node; current; current = current->Parent())
{
if (tinyxml2::XMLElement* sibling = current->NextSiblingElement())
return sibling;
}

// Reached the end
return nullptr;
}

View File

@ -8,8 +8,8 @@
#include "globalconstants.h"
#include "message.h"
#include <htmlcxx/html/ParserDom.h>
#include <boost/algorithm/string/case_conv.hpp>
#include <tinyxml2.h>
#ifdef USE_QT_GUI_LOGIN
#include "gui_login.h"
@ -358,23 +358,27 @@ std::string Website::LoginGetAuthCodeCurl(const std::string& login_form_html, co
std::string tagname_username = "login[username]";
std::string tagname_password = "login[password]";
std::string tagname_login = "login[login]";
std::string tagname_token;
std::string tagname_token = "login[_token]";
htmlcxx::HTML::ParserDom parser;
tree<htmlcxx::HTML::Node> login_dom = parser.parseTree(login_form_html);
tree<htmlcxx::HTML::Node>::iterator login_it = login_dom.begin();
tree<htmlcxx::HTML::Node>::iterator login_it_end = login_dom.end();
for (; login_it != login_it_end; ++login_it)
std::string login_form_xhtml = Util::htmlToXhtml(login_form_html);
tinyxml2::XMLDocument doc;
doc.Parse(login_form_xhtml.c_str());
tinyxml2::XMLNode* node = doc.FirstChildElement("html");
while(node)
{
if (login_it->tagName()=="input")
tinyxml2::XMLElement *element = node->ToElement();
if (element->Name() && !std::string(element->Name()).compare("input"))
{
login_it->parseAttributes();
if (login_it->attribute("id").second == "login__token")
std::string name = element->Attribute("name");
if (name == tagname_token)
{
token = login_it->attribute("value").second; // login token
tagname_token = login_it->attribute("name").second;
token = element->Attribute("value");
break;
}
}
node = Util::nextXMLNode(node);
}
if (token.empty())
@ -419,25 +423,28 @@ std::string Website::LoginGetAuthCodeCurl(const std::string& login_form_html, co
std::string tagname_two_step_auth_letter_2 = "second_step_authentication[token][letter_2]";
std::string tagname_two_step_auth_letter_3 = "second_step_authentication[token][letter_3]";
std::string tagname_two_step_auth_letter_4 = "second_step_authentication[token][letter_4]";
std::string tagname_two_step_token;
std::string tagname_two_step_token = "second_step_authentication[_token]";
std::string token_two_step;
std::string two_step_html = this->getResponse(redirect_url);
redirect_url = "";
tree<htmlcxx::HTML::Node> two_step_dom = parser.parseTree(two_step_html);
tree<htmlcxx::HTML::Node>::iterator two_step_it = two_step_dom.begin();
tree<htmlcxx::HTML::Node>::iterator two_step_it_end = two_step_dom.end();
for (; two_step_it != two_step_it_end; ++two_step_it)
std::string two_step_xhtml = Util::htmlToXhtml(two_step_html);
doc.Parse(two_step_xhtml.c_str());
node = doc.FirstChildElement("html");
while(node)
{
if (two_step_it->tagName()=="input")
tinyxml2::XMLElement *element = node->ToElement();
if (element->Name() && !std::string(element->Name()).compare("input"))
{
two_step_it->parseAttributes();
if (two_step_it->attribute("id").second == "second_step_authentication__token")
std::string name = element->Attribute("name");
if (name == tagname_two_step_token)
{
token_two_step = two_step_it->attribute("value").second; // two step token
tagname_two_step_token = two_step_it->attribute("name").second;
token_two_step = element->Attribute("value");
break;
}
}
node = Util::nextXMLNode(node);
}
std::cerr << "Security code: ";
@ -568,52 +575,6 @@ bool Website::IsLoggedIn()
return this->IsloggedInSimple();
}
/* Complex login check. Check login by checking email address on the account settings page.
returns true if we are logged in
returns false if we are not logged in
*/
bool Website::IsLoggedInComplex(const std::string& email)
{
const std::string html = this->getResponse("https://www.gog.com/account/settings/security");
// boost::algorithm::to_lower does in-place modification but "email" is read-only so we need to make a copy of it
const std::string email_lowercase = boost::algorithm::to_lower_copy(email);

// Parse the page once; the iterators below refer to this copy of the tree
htmlcxx::HTML::ParserDom parser;
tree<htmlcxx::HTML::Node> dom = parser.parseTree(html);
tree<htmlcxx::HTML::Node>::iterator end = dom.end();
for (tree<htmlcxx::HTML::Node>::iterator it = dom.begin(); it != end; ++it)
{
if (it->tagName() != "strong")
continue;

it->parseAttributes();
if (it->attribute("class").second != "settings-item__value settings-item__section")
continue;

// Compare the text children of the matching <strong> element against the email
for (unsigned int i = 0; i < dom.number_of_children(it); ++i)
{
tree<htmlcxx::HTML::Node>::iterator tag_it = dom.child(it, i);
if (tag_it->isTag() || tag_it->isComment())
continue;

if (boost::algorithm::to_lower_copy(tag_it->text()) == email_lowercase)
return true; // We are logged in so no need to go through the remaining tags
}
}
return false;
}
/* Simple login check. Check login by trying to get account page. If response code isn't 200 then login failed.
returns true if we are logged in
returns false if we are not logged in