Remove htmlcxx dependency and use libtidy with tinyxml2 to parse html

This commit is contained in:
Sude 2024-05-09 14:37:53 +03:00
parent 1866f4c65c
commit 1c0ab298e4
10 changed files with 130 additions and 155 deletions

View File

@ -32,7 +32,7 @@ jobs:
run: | run: |
sudo apt -y update sudo apt -y update
sudo apt -y install ninja-build build-essential libcurl4-openssl-dev libboost-regex-dev \ sudo apt -y install ninja-build build-essential libcurl4-openssl-dev libboost-regex-dev \
libjsoncpp-dev librhash-dev libtinyxml2-dev libhtmlcxx-dev \ libjsoncpp-dev librhash-dev libtinyxml2-dev libtidy-dev \
libboost-system-dev libboost-filesystem-dev libboost-program-options-dev \ libboost-system-dev libboost-filesystem-dev libboost-program-options-dev \
libboost-date-time-dev libboost-iostreams-dev help2man cmake \ libboost-date-time-dev libboost-iostreams-dev help2man cmake \
pkg-config zlib1g-dev qtwebengine5-dev pkg-config zlib1g-dev qtwebengine5-dev

View File

@ -25,11 +25,11 @@ find_package(Boost
) )
find_package(CURL 7.55.0 REQUIRED) find_package(CURL 7.55.0 REQUIRED)
find_package(Jsoncpp REQUIRED) find_package(Jsoncpp REQUIRED)
find_package(Htmlcxx REQUIRED)
find_package(Tinyxml2 REQUIRED) find_package(Tinyxml2 REQUIRED)
find_package(Rhash REQUIRED) find_package(Rhash REQUIRED)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
find_package(ZLIB REQUIRED) find_package(ZLIB REQUIRED)
find_package(Tidy REQUIRED)
file(GLOB SRC_FILES file(GLOB SRC_FILES
main.cpp main.cpp
@ -111,10 +111,10 @@ target_include_directories(${PROJECT_NAME}
PRIVATE ${CURL_INCLUDE_DIRS} PRIVATE ${CURL_INCLUDE_DIRS}
PRIVATE ${OAuth_INCLUDE_DIRS} PRIVATE ${OAuth_INCLUDE_DIRS}
PRIVATE ${Jsoncpp_INCLUDE_DIRS} PRIVATE ${Jsoncpp_INCLUDE_DIRS}
PRIVATE ${Htmlcxx_INCLUDE_DIRS}
PRIVATE ${Tinyxml2_INCLUDE_DIRS} PRIVATE ${Tinyxml2_INCLUDE_DIRS}
PRIVATE ${Rhash_INCLUDE_DIRS} PRIVATE ${Rhash_INCLUDE_DIRS}
PRIVATE ${ZLIB_INCLUDE_DIRS} PRIVATE ${ZLIB_INCLUDE_DIRS}
PRIVATE ${Tidy_INCLUDE_DIRS}
) )
target_link_libraries(${PROJECT_NAME} target_link_libraries(${PROJECT_NAME}
@ -122,11 +122,11 @@ target_link_libraries(${PROJECT_NAME}
PRIVATE ${CURL_LIBRARIES} PRIVATE ${CURL_LIBRARIES}
PRIVATE ${OAuth_LIBRARIES} PRIVATE ${OAuth_LIBRARIES}
PRIVATE ${Jsoncpp_LIBRARIES} PRIVATE ${Jsoncpp_LIBRARIES}
PRIVATE ${Htmlcxx_LIBRARIES}
PRIVATE ${Tinyxml2_LIBRARIES} PRIVATE ${Tinyxml2_LIBRARIES}
PRIVATE ${Rhash_LIBRARIES} PRIVATE ${Rhash_LIBRARIES}
PRIVATE ${CMAKE_THREAD_LIBS_INIT} PRIVATE ${CMAKE_THREAD_LIBS_INIT}
PRIVATE ${ZLIB_LIBRARIES} PRIVATE ${ZLIB_LIBRARIES}
PRIVATE ${Tidy_LIBRARIES}
) )
# Check if libatomic is needed in order to use std::atomic, and add # Check if libatomic is needed in order to use std::atomic, and add

View File

@ -8,7 +8,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen
* [libcurl](https://curl.haxx.se/libcurl/) >= 7.55.0 * [libcurl](https://curl.haxx.se/libcurl/) >= 7.55.0
* [librhash](https://github.com/rhash/RHash) * [librhash](https://github.com/rhash/RHash)
* [jsoncpp](https://github.com/open-source-parsers/jsoncpp) * [jsoncpp](https://github.com/open-source-parsers/jsoncpp)
* [htmlcxx](http://htmlcxx.sourceforge.net/) * [libtidy](https://www.html-tidy.org/)
* [tinyxml2](https://github.com/leethomason/tinyxml2) * [tinyxml2](https://github.com/leethomason/tinyxml2)
* [boost](http://www.boost.org/) (regex, date-time, system, filesystem, program-options, iostreams) * [boost](http://www.boost.org/) (regex, date-time, system, filesystem, program-options, iostreams)
* [zlib](https://www.zlib.net/) * [zlib](https://www.zlib.net/)
@ -22,7 +22,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen
## Debian/Ubuntu ## Debian/Ubuntu
# apt install build-essential libcurl4-openssl-dev libboost-regex-dev \ # apt install build-essential libcurl4-openssl-dev libboost-regex-dev \
libjsoncpp-dev librhash-dev libtinyxml2-dev libhtmlcxx-dev \ libjsoncpp-dev librhash-dev libtinyxml2-dev libtidy-dev \
libboost-system-dev libboost-filesystem-dev libboost-program-options-dev \ libboost-system-dev libboost-filesystem-dev libboost-program-options-dev \
libboost-date-time-dev libboost-iostreams-dev help2man cmake \ libboost-date-time-dev libboost-iostreams-dev help2man cmake \
pkg-config zlib1g-dev qtwebengine5-dev ninja-build pkg-config zlib1g-dev qtwebengine5-dev ninja-build
@ -35,7 +35,7 @@ It uses the same API as GOG Galaxy which doesn't have Linux support at the momen
## Fedora ## Fedora
``` ```
sudo dnf install cmake make gcc gcc-c++ glibc tinyxml2-devel rhash-devel \ sudo dnf install cmake make gcc gcc-c++ glibc tinyxml2-devel rhash-devel \
htmlcxx-devel tinyxml-devel jsoncpp-devel liboauth-devel libcurl-devel \ libtidy-devel tinyxml-devel jsoncpp-devel liboauth-devel libcurl-devel \
boost-devel help2man boost-devel help2man
``` ```
### Build and Install ### Build and Install

View File

@ -1,54 +0,0 @@
# - Try to find htmlcxx
#
# Once done this will define
# Htmlcxx_FOUND - System has htmlcxx
# Htmlcxx_INCLUDE_DIRS - The htmlcxx include directories
# Htmlcxx_LIBRARIES - The libraries needed to use htmlcxx
# Query pkg-config for htmlcxx. REQUIRED makes configuration stop here when
# the htmlcxx pkg-config module cannot be found.
find_package(PkgConfig)
pkg_check_modules(PC_HTMLCXX REQUIRED htmlcxx)
# Header directory: searched in the pkg-config supplied locations, also trying
# an "htmlcxx" sub-directory of each location.
find_path(HTMLCXX_INCLUDE_DIR
NAMES
css/parser.h
html/tree.h
HINTS
${PC_HTMLCXX_INCLUDEDIR}
${PC_HTMLCXX_INCLUDE_DIRS}
PATH_SUFFIXES
htmlcxx
PATHS
${PC_HTMLCXX_INCLUDE_DIRS}
)
# Main htmlcxx library.
find_library(HTMLCXX_LIBRARY_HTMLCXX htmlcxx
HINTS
${PC_HTMLCXX_LIBDIR}
${PC_HTMLCXX_LIBRARY_DIRS}
PATHS
${PC_HTMLCXX_LIBRARY_DIRS}
)
# css_parser library, looked up in the same pkg-config supplied locations.
find_library(HTMLCXX_LIBRARY_CSS_PARSER css_parser
HINTS
${PC_HTMLCXX_LIBDIR}
${PC_HTMLCXX_LIBRARY_DIRS}
PATHS
${PC_HTMLCXX_LIBRARY_DIRS}
)
# css_parser_pp library, looked up in the same pkg-config supplied locations.
find_library(HTMLCXX_LIBRARY_CSS_PARSER_PP css_parser_pp
HINTS
${PC_HTMLCXX_LIBDIR}
${PC_HTMLCXX_LIBRARY_DIRS}
PATHS
${PC_HTMLCXX_LIBRARY_DIRS}
)
# Keep the internal cache entries out of the default cmake-gui/ccmake view.
mark_as_advanced(HTMLCXX_INCLUDE_DIR HTMLCXX_LIBRARY_HTMLCXX HTMLCXX_LIBRARY_CSS_PARSER HTMLCXX_LIBRARY_CSS_PARSER_PP)
# Publish the result variables based on the pkg-config result only.
# NOTE(review): the find_path/find_library results above are not checked here,
# so any of them may still be *-NOTFOUND when Htmlcxx_FOUND is ON.
if(PC_HTMLCXX_FOUND)
set(Htmlcxx_FOUND ON)
set(Htmlcxx_INCLUDE_DIRS ${HTMLCXX_INCLUDE_DIR})
set(Htmlcxx_LIBRARIES ${HTMLCXX_LIBRARY_HTMLCXX} ${HTMLCXX_LIBRARY_CSS_PARSER} ${HTMLCXX_LIBRARY_CSS_PARSER_PP})
endif(PC_HTMLCXX_FOUND)

33
cmake/FindTidy.cmake Normal file
View File

@ -0,0 +1,33 @@
# - Try to find tidy
#
# Once done this will define
#  Tidy_FOUND - System has tidy (both the header and the library were located)
#  Tidy_INCLUDE_DIRS - The tidy include directories
#  Tidy_LIBRARIES - The libraries needed to use tidy

# pkg-config is only used to obtain search hints. It is optional, so the
# pkg_check_modules() call is skipped when the PkgConfig module is unavailable
# (the command would be undefined in that case).
find_package(PkgConfig)
if(PKG_CONFIG_FOUND)
  pkg_check_modules(PC_TIDY tidy)
endif()

find_path(TIDY_INCLUDE_DIR tidy.h
  HINTS
    ${PC_TIDY_INCLUDEDIR}
    ${PC_TIDY_INCLUDE_DIRS}
  PATHS
    ${PC_TIDY_INCLUDE_DIRS}
  )

find_library(TIDY_LIBRARY tidy
  HINTS
    ${PC_TIDY_LIBDIR}
    ${PC_TIDY_LIBRARY_DIRS}
  PATHS
    ${PC_TIDY_LIBRARY_DIRS}
  )

mark_as_advanced(TIDY_INCLUDE_DIR TIDY_LIBRARY)

# Sets Tidy_FOUND only when BOTH the library and the header were found, and
# honours the REQUIRED / QUIET arguments of the calling find_package(Tidy ...).
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Tidy
  REQUIRED_VARS TIDY_LIBRARY TIDY_INCLUDE_DIR
  )

if(Tidy_FOUND)
  set(Tidy_INCLUDE_DIRS ${TIDY_INCLUDE_DIR})
  set(Tidy_LIBRARIES ${TIDY_LIBRARY})
endif(Tidy_FOUND)

View File

@ -24,6 +24,7 @@
#include <json/json.h> #include <json/json.h>
#include <boost/date_time/posix_time/posix_time.hpp> #include <boost/date_time/posix_time/posix_time.hpp>
#include <curl/curl.h> #include <curl/curl.h>
#include <tinyxml2.h>
typedef struct typedef struct
{ {
@ -104,6 +105,8 @@ namespace Util
} }
Json::Value readJsonFile(const std::string& path); Json::Value readJsonFile(const std::string& path);
std::string transformGamename(const std::string& gamename); std::string transformGamename(const std::string& gamename);
std::string htmlToXhtml(const std::string& html);
tinyxml2::XMLNode* nextXMLNode(tinyxml2::XMLNode* node);
} }
#endif // UTIL_H #endif // UTIL_H

View File

@ -32,7 +32,6 @@ class Website
private: private:
CURL* curlhandle; CURL* curlhandle;
bool IsloggedInSimple(); bool IsloggedInSimple();
bool IsLoggedInComplex(const std::string& email);
std::map<std::string, std::string> getTagsFromJson(const Json::Value& json); std::map<std::string, std::string> getTagsFromJson(const Json::Value& json);
int retries; int retries;
std::string LoginGetAuthCode(const std::string& email, const std::string& password); std::string LoginGetAuthCode(const std::string& email, const std::string& password);

View File

@ -24,8 +24,6 @@
#include <boost/date_time/posix_time/posix_time.hpp> #include <boost/date_time/posix_time/posix_time.hpp>
#include <tinyxml2.h> #include <tinyxml2.h>
#include <json/json.h> #include <json/json.h>
#include <htmlcxx/html/ParserDom.h>
#include <htmlcxx/html/Uri.h>
#include <termios.h> #include <termios.h>
#include <algorithm> #include <algorithm>
#include <thread> #include <thread>
@ -1630,30 +1628,18 @@ std::string Downloader::getSerialsFromJSON(const Json::Value& json)
} }
else else
{ {
htmlcxx::HTML::ParserDom parser; std::string xhtml = Util::htmlToXhtml(cdkey);
tree<htmlcxx::HTML::Node> dom = parser.parseTree(cdkey); tinyxml2::XMLDocument doc;
tree<htmlcxx::HTML::Node>::iterator it = dom.begin(); doc.Parse(xhtml.c_str());
tree<htmlcxx::HTML::Node>::iterator end = dom.end(); tinyxml2::XMLNode* node = doc.FirstChildElement("html");
for (; it != end; ++it) while(node)
{ {
std::string tag_text; tinyxml2::XMLElement *element = node->ToElement();
if (it->tagName() == "span") const char* text = element->GetText();
{ if (text)
for (unsigned int j = 0; j < dom.number_of_children(it); ++j) serials << text << std::endl;
{
tree<htmlcxx::HTML::Node>::iterator span_it = dom.child(it, j);
if (!span_it->isTag() && !span_it->isComment())
tag_text = span_it->text();
}
}
if (!tag_text.empty()) node = Util::nextXMLNode(node);
{
boost::regex expression("^\\h+|\\h+$");
std::string text = boost::regex_replace(tag_text, expression, "");
if (!text.empty())
serials << text << std::endl;
}
} }
} }

View File

@ -11,10 +11,11 @@
#include <boost/iostreams/filter/gzip.hpp> #include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filtering_streambuf.hpp> #include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp> #include <boost/iostreams/copy.hpp>
#include <tinyxml2.h>
#include <json/json.h> #include <json/json.h>
#include <fstream> #include <fstream>
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <tidy.h>
#include <tidybuffio.h>
/* /*
Create filepath from specified directory and path Create filepath from specified directory and path
@ -1007,3 +1008,49 @@ std::string Util::transformGamename(const std::string& gamename)
return gamename_transformed; return gamename_transformed;
} }
std::string Util::htmlToXhtml(const std::string& html)
{
std::string xhtml;
TidyBuffer buffer = {0, 0, 0, 0, 0};
int rc = -1;
TidyDoc doc = tidyCreate();
tidyOptSetBool(doc, TidyXhtmlOut, yes);
tidyOptSetBool(doc, TidyForceOutput, yes);
tidyOptSetInt(doc, TidyWrapLen, 0);
tidyOptSetInt(doc, TidyShowInfo, 0);
tidyOptSetInt(doc, TidyShowWarnings, 0);
rc = tidyParseString(doc, html.c_str());
if ( rc >= 0 )
rc = tidyCleanAndRepair(doc);
if ( rc >= 0 )
rc = tidySaveBuffer(doc, &buffer);
xhtml = std::string((char*)buffer.bp, buffer.size);
tidyBufFree(&buffer);
tidyRelease(doc);
return xhtml;
}
/*
    Get the next element when walking the XML tree depth-first in document order.
    Only element nodes are visited.

    node - current position in the tree

    returns the first child element if there is one,
            otherwise the next sibling element of the node or of its closest
            ancestor that has one
    returns nullptr when there are no more elements or when node is nullptr
*/
tinyxml2::XMLNode* Util::nextXMLNode(tinyxml2::XMLNode* node)
{
    if (node == nullptr)
        return nullptr;

    // Has child element, go to first child
    if (tinyxml2::XMLElement* child = node->FirstChildElement())
        return child;

    // Otherwise take the next sibling element of the node itself or, failing
    // that, of the nearest ancestor that has one
    for (tinyxml2::XMLNode* current = node; current != nullptr; current = current->Parent())
    {
        if (tinyxml2::XMLElement* sibling = current->NextSiblingElement())
            return sibling;
    }

    // Reached the end
    return nullptr;
}

View File

@ -8,8 +8,8 @@
#include "globalconstants.h" #include "globalconstants.h"
#include "message.h" #include "message.h"
#include <htmlcxx/html/ParserDom.h>
#include <boost/algorithm/string/case_conv.hpp> #include <boost/algorithm/string/case_conv.hpp>
#include <tinyxml2.h>
#ifdef USE_QT_GUI_LOGIN #ifdef USE_QT_GUI_LOGIN
#include "gui_login.h" #include "gui_login.h"
@ -358,23 +358,27 @@ std::string Website::LoginGetAuthCodeCurl(const std::string& login_form_html, co
std::string tagname_username = "login[username]"; std::string tagname_username = "login[username]";
std::string tagname_password = "login[password]"; std::string tagname_password = "login[password]";
std::string tagname_login = "login[login]"; std::string tagname_login = "login[login]";
std::string tagname_token; std::string tagname_token = "login[_token]";
htmlcxx::HTML::ParserDom parser; std::string login_form_xhtml = Util::htmlToXhtml(login_form_html);
tree<htmlcxx::HTML::Node> login_dom = parser.parseTree(login_form_html);
tree<htmlcxx::HTML::Node>::iterator login_it = login_dom.begin(); tinyxml2::XMLDocument doc;
tree<htmlcxx::HTML::Node>::iterator login_it_end = login_dom.end(); doc.Parse(login_form_xhtml.c_str());
for (; login_it != login_it_end; ++login_it) tinyxml2::XMLNode* node = doc.FirstChildElement("html");
while(node)
{ {
if (login_it->tagName()=="input") tinyxml2::XMLElement *element = node->ToElement();
if (element->Name() && !std::string(element->Name()).compare("input"))
{ {
login_it->parseAttributes(); std::string name = element->Attribute("name");
if (login_it->attribute("id").second == "login__token") if (name == tagname_token)
{ {
token = login_it->attribute("value").second; // login token token = element->Attribute("value");
tagname_token = login_it->attribute("name").second; break;
} }
} }
node = Util::nextXMLNode(node);
} }
if (token.empty()) if (token.empty())
@ -419,25 +423,28 @@ std::string Website::LoginGetAuthCodeCurl(const std::string& login_form_html, co
std::string tagname_two_step_auth_letter_2 = "second_step_authentication[token][letter_2]"; std::string tagname_two_step_auth_letter_2 = "second_step_authentication[token][letter_2]";
std::string tagname_two_step_auth_letter_3 = "second_step_authentication[token][letter_3]"; std::string tagname_two_step_auth_letter_3 = "second_step_authentication[token][letter_3]";
std::string tagname_two_step_auth_letter_4 = "second_step_authentication[token][letter_4]"; std::string tagname_two_step_auth_letter_4 = "second_step_authentication[token][letter_4]";
std::string tagname_two_step_token; std::string tagname_two_step_token = "second_step_authentication[_token]";
std::string token_two_step; std::string token_two_step;
std::string two_step_html = this->getResponse(redirect_url); std::string two_step_html = this->getResponse(redirect_url);
redirect_url = ""; redirect_url = "";
tree<htmlcxx::HTML::Node> two_step_dom = parser.parseTree(two_step_html); std::string two_step_xhtml = Util::htmlToXhtml(two_step_html);
tree<htmlcxx::HTML::Node>::iterator two_step_it = two_step_dom.begin(); doc.Parse(two_step_xhtml.c_str());
tree<htmlcxx::HTML::Node>::iterator two_step_it_end = two_step_dom.end(); node = doc.FirstChildElement("html");
for (; two_step_it != two_step_it_end; ++two_step_it) while(node)
{ {
if (two_step_it->tagName()=="input") tinyxml2::XMLElement *element = node->ToElement();
if (element->Name() && !std::string(element->Name()).compare("input"))
{ {
two_step_it->parseAttributes(); std::string name = element->Attribute("name");
if (two_step_it->attribute("id").second == "second_step_authentication__token") if (name == tagname_two_step_token)
{ {
token_two_step = two_step_it->attribute("value").second; // two step token token_two_step = element->Attribute("value");
tagname_two_step_token = two_step_it->attribute("name").second; break;
} }
} }
node = Util::nextXMLNode(node);
} }
std::cerr << "Security code: "; std::cerr << "Security code: ";
@ -568,52 +575,6 @@ bool Website::IsLoggedIn()
return this->IsloggedInSimple(); return this->IsloggedInSimple();
} }
/* Complex login check. Check login by checking email address on the account settings page.
    The comparison is case-insensitive: both the given email and the text found
    on the page are lowercased before comparing.

    email - email address of the account that is expected to be logged in

    returns true if we are logged in
    returns false if we are not logged in
*/
bool Website::IsLoggedInComplex(const std::string& email)
{
    const std::string html = this->getResponse("https://www.gog.com/account/settings/security");
    // boost::algorithm::to_lower does in-place modification but "email" is read-only so we need to make a copy of it
    const std::string email_lowercase = boost::algorithm::to_lower_copy(email);

    // The page is parsed once; the iterators below refer to this tree
    htmlcxx::HTML::ParserDom parser;
    tree<htmlcxx::HTML::Node> dom = parser.parseTree(html);
    tree<htmlcxx::HTML::Node>::iterator it = dom.begin();
    tree<htmlcxx::HTML::Node>::iterator end = dom.end();
    for (; it != end; ++it)
    {
        if (it->tagName() != "strong")
            continue;

        it->parseAttributes();
        if (it->attribute("class").second != "settings-item__value settings-item__section")
            continue;

        // Compare the text children of the matching <strong> tag with the email
        for (unsigned int i = 0; i < dom.number_of_children(it); ++i)
        {
            tree<htmlcxx::HTML::Node>::iterator tag_it = dom.child(it, i);
            if (tag_it->isTag() || tag_it->isComment())
                continue;

            // We are logged in so no need to go through the remaining tags
            if (boost::algorithm::to_lower_copy(tag_it->text()) == email_lowercase)
                return true;
        }
    }

    return false;
}
/* Simple login check. Check login by trying to get account page. If response code isn't 200 then login failed. /* Simple login check. Check login by trying to get account page. If response code isn't 200 then login failed.
returns true if we are logged in returns true if we are logged in
returns false if we are not logged in returns false if we are not logged in