From 1c0ab298e4068066a6fcb41b9f894d2c828880b2 Mon Sep 17 00:00:00 2001 From: Sude Date: Thu, 9 May 2024 14:37:53 +0300 Subject: [PATCH] Remove htmlcxx dependency and use libtidy with tinyxml2 to parse html --- .github/workflows/linux.yml | 2 +- CMakeLists.txt | 6 +-- README.md | 6 +-- cmake/FindHtmlcxx.cmake | 54 --------------------- cmake/FindTidy.cmake | 33 +++++++++++++ include/util.h | 3 ++ include/website.h | 1 - src/downloader.cpp | 34 ++++--------- src/util.cpp | 49 ++++++++++++++++++- src/website.cpp | 97 +++++++++++-------------------------- 10 files changed, 130 insertions(+), 155 deletions(-) delete mode 100644 cmake/FindHtmlcxx.cmake create mode 100644 cmake/FindTidy.cmake diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 7d6d81e..a286e1c 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -32,7 +32,7 @@ jobs: run: | sudo apt -y update sudo apt -y install ninja-build build-essential libcurl4-openssl-dev libboost-regex-dev \ - libjsoncpp-dev librhash-dev libtinyxml2-dev libhtmlcxx-dev \ + libjsoncpp-dev librhash-dev libtinyxml2-dev libtidy-dev \ libboost-system-dev libboost-filesystem-dev libboost-program-options-dev \ libboost-date-time-dev libboost-iostreams-dev help2man cmake \ pkg-config zlib1g-dev qtwebengine5-dev diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f489c1..d63a757 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,11 +25,11 @@ find_package(Boost ) find_package(CURL 7.55.0 REQUIRED) find_package(Jsoncpp REQUIRED) -find_package(Htmlcxx REQUIRED) find_package(Tinyxml2 REQUIRED) find_package(Rhash REQUIRED) find_package(Threads REQUIRED) find_package(ZLIB REQUIRED) +find_package(Tidy REQUIRED) file(GLOB SRC_FILES main.cpp @@ -111,10 +111,10 @@ target_include_directories(${PROJECT_NAME} PRIVATE ${CURL_INCLUDE_DIRS} PRIVATE ${OAuth_INCLUDE_DIRS} PRIVATE ${Jsoncpp_INCLUDE_DIRS} - PRIVATE ${Htmlcxx_INCLUDE_DIRS} PRIVATE ${Tinyxml2_INCLUDE_DIRS} PRIVATE ${Rhash_INCLUDE_DIRS} PRIVATE 
${ZLIB_INCLUDE_DIRS} + PRIVATE ${Tidy_INCLUDE_DIRS} ) target_link_libraries(${PROJECT_NAME} @@ -122,11 +122,11 @@ target_link_libraries(${PROJECT_NAME} PRIVATE ${CURL_LIBRARIES} PRIVATE ${OAuth_LIBRARIES} PRIVATE ${Jsoncpp_LIBRARIES} - PRIVATE ${Htmlcxx_LIBRARIES} PRIVATE ${Tinyxml2_LIBRARIES} PRIVATE ${Rhash_LIBRARIES} PRIVATE ${CMAKE_THREAD_LIBS_INIT} PRIVATE ${ZLIB_LIBRARIES} + PRIVATE ${Tidy_LIBRARIES} ) # Check if libatomic is needed in order to use std::atomic, and add diff --git a/README.md b/README.md index a6997bf..4dcd1a9 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen * [libcurl](https://curl.haxx.se/libcurl/) >= 7.55.0 * [librhash](https://github.com/rhash/RHash) * [jsoncpp](https://github.com/open-source-parsers/jsoncpp) -* [htmlcxx](http://htmlcxx.sourceforge.net/) +* [libtidy](https://www.html-tidy.org/) * [tinyxml2](https://github.com/leethomason/tinyxml2) * [boost](http://www.boost.org/) (regex, date-time, system, filesystem, program-options, iostreams) * [zlib](https://www.zlib.net/) @@ -22,7 +22,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen ## Debian/Ubuntu # apt install build-essential libcurl4-openssl-dev libboost-regex-dev \ - libjsoncpp-dev librhash-dev libtinyxml2-dev libhtmlcxx-dev \ + libjsoncpp-dev librhash-dev libtinyxml2-dev libtidy-dev \ libboost-system-dev libboost-filesystem-dev libboost-program-options-dev \ libboost-date-time-dev libboost-iostreams-dev help2man cmake \ pkg-config zlib1g-dev qtwebengine5-dev ninja-build @@ -35,7 +35,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen ## Fedora ``` sudo dnf install cmake make gcc gcc-c++ glibc tinyxml2-devel rhash-devel \ -htmlcxx-devel tinyxml-devel jsoncpp-devel liboauth-devel libcurl-devel \ +libtidy-devel tinyxml-devel jsoncpp-devel liboauth-devel libcurl-devel \ boost-devel help2man ``` ### Build and Install diff --git 
a/cmake/FindHtmlcxx.cmake b/cmake/FindHtmlcxx.cmake deleted file mode 100644 index 6dbf4b1..0000000 --- a/cmake/FindHtmlcxx.cmake +++ /dev/null @@ -1,54 +0,0 @@ -# - Try to find htmlcxx -# -# Once done this will define -# Htmlcxx_FOUND - System has htmlcxx -# Htmlcxx_INCLUDE_DIRS - The htmlcxx include directories -# Htmlcxx_LIBRARIES - The libraries needed to use htmlcxx - -find_package(PkgConfig) -pkg_check_modules(PC_HTMLCXX REQUIRED htmlcxx) - -find_path(HTMLCXX_INCLUDE_DIR - NAMES - css/parser.h - html/tree.h - HINTS - ${PC_HTMLCXX_INCLUDEDIR} - ${PC_HTMLCXX_INCLUDE_DIRS} - PATH_SUFFIXES - htmlcxx - PATHS - ${PC_HTMLCXX_INCLUDE_DIRS} - ) - -find_library(HTMLCXX_LIBRARY_HTMLCXX htmlcxx - HINTS - ${PC_HTMLCXX_LIBDIR} - ${PC_HTMLCXX_LIBRARY_DIRS} - PATHS - ${PC_HTMLCXX_LIBRARY_DIRS} - ) - -find_library(HTMLCXX_LIBRARY_CSS_PARSER css_parser - HINTS - ${PC_HTMLCXX_LIBDIR} - ${PC_HTMLCXX_LIBRARY_DIRS} - PATHS - ${PC_HTMLCXX_LIBRARY_DIRS} - ) - -find_library(HTMLCXX_LIBRARY_CSS_PARSER_PP css_parser_pp - HINTS - ${PC_HTMLCXX_LIBDIR} - ${PC_HTMLCXX_LIBRARY_DIRS} - PATHS - ${PC_HTMLCXX_LIBRARY_DIRS} - ) - -mark_as_advanced(HTMLCXX_INCLUDE_DIR HTMLCXX_LIBRARY_HTMLCXX HTMLCXX_LIBRARY_CSS_PARSER HTMLCXX_LIBRARY_CSS_PARSER_PP) - -if(PC_HTMLCXX_FOUND) - set(Htmlcxx_FOUND ON) - set(Htmlcxx_INCLUDE_DIRS ${HTMLCXX_INCLUDE_DIR}) - set(Htmlcxx_LIBRARIES ${HTMLCXX_LIBRARY_HTMLCXX} ${HTMLCXX_LIBRARY_CSS_PARSER} ${HTMLCXX_LIBRARY_CSS_PARSER_PP}) -endif(PC_HTMLCXX_FOUND) diff --git a/cmake/FindTidy.cmake b/cmake/FindTidy.cmake new file mode 100644 index 0000000..f811b63 --- /dev/null +++ b/cmake/FindTidy.cmake @@ -0,0 +1,33 @@ +# - Try to find tidy +# +# Once done this will define +# Tidy_FOUND - System has tidy +# Tidy_INCLUDE_DIRS - The tidy include directories +# Tidy_LIBRARIES - The libraries needed to use tidy + +find_package(PkgConfig) +pkg_check_modules(PC_TIDY tidy) + +find_path(TIDY_INCLUDE_DIR tidy.h + HINTS + ${PC_TIDY_INCLUDEDIR} + ${PC_TIDY_INCLUDE_DIRS} + PATHS + 
${PC_TIDY_INCLUDE_DIRS} + ) + +find_library(TIDY_LIBRARY tidy + HINTS + ${PC_TIDY_LIBDIR} + ${PC_TIDY_LIBRARY_DIRS} + PATHS + ${PC_TIDY_LIBRARY_DIRS} + ) + +mark_as_advanced(TIDY_INCLUDE_DIR TIDY_LIBRARY) + +# Sets Tidy_FOUND only when both header and library exist; honours REQUIRED/QUIET +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Tidy DEFAULT_MSG TIDY_LIBRARY TIDY_INCLUDE_DIR) +set(Tidy_INCLUDE_DIRS ${TIDY_INCLUDE_DIR}) +set(Tidy_LIBRARIES ${TIDY_LIBRARY}) diff --git a/include/util.h b/include/util.h index 85234d0..b39c641 100644 --- a/include/util.h +++ b/include/util.h @@ -24,6 +24,7 @@ #include #include #include +#include typedef struct { @@ -104,6 +105,8 @@ namespace Util } Json::Value readJsonFile(const std::string& path); std::string transformGamename(const std::string& gamename); + std::string htmlToXhtml(const std::string& html); + tinyxml2::XMLNode* nextXMLNode(tinyxml2::XMLNode* node); } #endif // UTIL_H diff --git a/include/website.h b/include/website.h index 6cd7288..79ab742 100644 --- a/include/website.h +++ b/include/website.h @@ -32,7 +32,6 @@ class Website private: CURL* curlhandle; bool IsloggedInSimple(); - bool IsLoggedInComplex(const std::string& email); std::map getTagsFromJson(const Json::Value& json); int retries; std::string LoginGetAuthCode(const std::string& email, const std::string& password); diff --git a/src/downloader.cpp b/src/downloader.cpp index d8a706e..7f5fedf 100644 --- a/src/downloader.cpp +++ b/src/downloader.cpp @@ -24,8 +24,6 @@ #include #include #include -#include -#include #include #include #include @@ -1630,30 +1628,18 @@ std::string Downloader::getSerialsFromJSON(const Json::Value& json) } else { - htmlcxx::HTML::ParserDom parser; - tree dom = parser.parseTree(cdkey); - tree::iterator it = dom.begin(); - tree::iterator end = dom.end(); - for (; it != end; ++it) + std::string xhtml = Util::htmlToXhtml(cdkey); + tinyxml2::XMLDocument doc; + doc.Parse(xhtml.c_str()); + tinyxml2::XMLNode* node = doc.FirstChildElement("html"); + while(node) { - std::string tag_text; - if (it->tagName() == "span") - { - for (unsigned int j = 0; 
j < dom.number_of_children(it); ++j) - { - tree::iterator span_it = dom.child(it, j); - if (!span_it->isTag() && !span_it->isComment()) - tag_text = span_it->text(); - } - } + tinyxml2::XMLElement *element = node->ToElement(); + const char* text = element->GetText(); + if (text) + serials << text << std::endl; - if (!tag_text.empty()) - { - boost::regex expression("^\\h+|\\h+$"); - std::string text = boost::regex_replace(tag_text, expression, ""); - if (!text.empty()) - serials << text << std::endl; - } + node = Util::nextXMLNode(node); } } diff --git a/src/util.cpp b/src/util.cpp index 218f1c1..b46fd96 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -11,10 +11,11 @@ #include #include #include -#include #include #include #include +#include +#include /* Create filepath from specified directory and path @@ -1007,3 +1008,49 @@ std::string Util::transformGamename(const std::string& gamename) return gamename_transformed; } + +std::string Util::htmlToXhtml(const std::string& html) +{ + std::string xhtml; + TidyBuffer buffer = {0, 0, 0, 0, 0}; + int rc = -1; + TidyDoc doc = tidyCreate(); + + tidyOptSetBool(doc, TidyXhtmlOut, yes); + tidyOptSetBool(doc, TidyForceOutput, yes); + tidyOptSetInt(doc, TidyWrapLen, 0); + tidyOptSetBool(doc, TidyShowInfo, no); /* boolean option: must be set with tidyOptSetBool */ + tidyOptSetBool(doc, TidyShowWarnings, no); /* boolean option: must be set with tidyOptSetBool */ + rc = tidyParseString(doc, html.c_str()); + if ( rc >= 0 ) + rc = tidyCleanAndRepair(doc); + if ( rc >= 0 ) + rc = tidySaveBuffer(doc, &buffer); + + xhtml = std::string((char*)buffer.bp, buffer.size); + + tidyBufFree(&buffer); + tidyRelease(doc); + + return xhtml; +} + +tinyxml2::XMLNode* Util::nextXMLNode(tinyxml2::XMLNode* node) +{ + if (node->FirstChildElement()) // Has child element, go to first child + node = node->FirstChildElement(); + else if (node->NextSiblingElement()) // Has sibling element, go to first sibling + node = node->NextSiblingElement(); + else + { + // Go to parent node until it has sibling + while(node->Parent() && !node->Parent()->NextSiblingElement()) + node 
= node->Parent(); + if(node->Parent() && node->Parent()->NextSiblingElement()) + node = node->Parent()->NextSiblingElement(); + else // Reached the end + node = nullptr; + } + + return node; +} diff --git a/src/website.cpp b/src/website.cpp index 6c5ea80..3febb97 100644 --- a/src/website.cpp +++ b/src/website.cpp @@ -8,8 +8,8 @@ #include "globalconstants.h" #include "message.h" -#include #include +#include #ifdef USE_QT_GUI_LOGIN #include "gui_login.h" @@ -358,23 +358,27 @@ std::string Website::LoginGetAuthCodeCurl(const std::string& login_form_html, co std::string tagname_username = "login[username]"; std::string tagname_password = "login[password]"; std::string tagname_login = "login[login]"; - std::string tagname_token; + std::string tagname_token = "login[_token]"; - htmlcxx::HTML::ParserDom parser; - tree login_dom = parser.parseTree(login_form_html); - tree::iterator login_it = login_dom.begin(); - tree::iterator login_it_end = login_dom.end(); - for (; login_it != login_it_end; ++login_it) + std::string login_form_xhtml = Util::htmlToXhtml(login_form_html); + + tinyxml2::XMLDocument doc; + doc.Parse(login_form_xhtml.c_str()); + tinyxml2::XMLNode* node = doc.FirstChildElement("html"); + while(node) { - if (login_it->tagName()=="input") + tinyxml2::XMLElement *element = node->ToElement(); + if (element->Name() && !std::string(element->Name()).compare("input")) { - login_it->parseAttributes(); - if (login_it->attribute("id").second == "login__token") + const char* value = element->Attribute("value"); /* nullptr when the input has no value attribute */ + if (value && element->Attribute("name", tagname_token.c_str())) /* non-null only when name exists and equals tagname_token */ { - token = login_it->attribute("value").second; // login token - tagname_token = login_it->attribute("name").second; + token = value; + break; } } + + node = Util::nextXMLNode(node); } if (token.empty()) @@ -419,25 +423,28 @@ std::string Website::LoginGetAuthCodeCurl(const std::string& login_form_html, co std::string tagname_two_step_auth_letter_2 = "second_step_authentication[token][letter_2]"; std::string 
tagname_two_step_auth_letter_3 = "second_step_authentication[token][letter_3]"; std::string tagname_two_step_auth_letter_4 = "second_step_authentication[token][letter_4]"; - std::string tagname_two_step_token; + std::string tagname_two_step_token = "second_step_authentication[_token]"; std::string token_two_step; std::string two_step_html = this->getResponse(redirect_url); redirect_url = ""; - tree two_step_dom = parser.parseTree(two_step_html); - tree::iterator two_step_it = two_step_dom.begin(); - tree::iterator two_step_it_end = two_step_dom.end(); - for (; two_step_it != two_step_it_end; ++two_step_it) + std::string two_step_xhtml = Util::htmlToXhtml(two_step_html); + doc.Parse(two_step_xhtml.c_str()); + node = doc.FirstChildElement("html"); + while(node) { - if (two_step_it->tagName()=="input") + tinyxml2::XMLElement *element = node->ToElement(); + if (element->Name() && !std::string(element->Name()).compare("input")) { - two_step_it->parseAttributes(); - if (two_step_it->attribute("id").second == "second_step_authentication__token") + const char* value = element->Attribute("value"); /* nullptr when the input has no value attribute */ + if (value && element->Attribute("name", tagname_two_step_token.c_str())) /* non-null only when name exists and equals tagname_two_step_token */ { - token_two_step = two_step_it->attribute("value").second; // two step token - tagname_two_step_token = two_step_it->attribute("name").second; + token_two_step = value; + break; } } + + node = Util::nextXMLNode(node); } std::cerr << "Security code: "; @@ -568,52 +575,6 @@ bool Website::IsLoggedIn() return this->IsloggedInSimple(); } -/* Complex login check. Check login by checking email address on the account settings page. 
- returns true if we are logged in - returns false if we are not logged in -*/ -bool Website::IsLoggedInComplex(const std::string& email) -{ - bool bIsLoggedIn = false; - std::string html = this->getResponse("https://www.gog.com/account/settings/security"); - std::string email_lowercase = boost::algorithm::to_lower_copy(email); // boost::algorithm::to_lower does in-place modification but "email" is read-only so we need to make a copy of it - - htmlcxx::HTML::ParserDom parser; - tree dom = parser.parseTree(html); - tree::iterator it = dom.begin(); - tree::iterator end = dom.end(); - dom = parser.parseTree(html); - it = dom.begin(); - end = dom.end(); - for (; it != end; ++it) - { - if (it->tagName()=="strong") - { - it->parseAttributes(); - if (it->attribute("class").second == "settings-item__value settings-item__section") - { - for (unsigned int i = 0; i < dom.number_of_children(it); ++i) - { - tree::iterator tag_it = dom.child(it, i); - if (!tag_it->isTag() && !tag_it->isComment()) - { - std::string tag_text = boost::algorithm::to_lower_copy(tag_it->text()); - if (tag_text == email_lowercase) - { - bIsLoggedIn = true; // We are logged in - break; - } - } - } - } - } - if (bIsLoggedIn) // We are logged in so no need to go through the remaining tags - break; - } - - return bIsLoggedIn; -} - /* Simple login check. Check login by trying to get account page. If response code isn't 200 then login failed. returns true if we are logged in returns false if we are not logged in