diff options
author | sijanec <anton@sijanec.eu> | 2021-04-01 23:30:37 +0200 |
---|---|---|
committer | sijanec <anton@sijanec.eu> | 2021-04-01 23:30:37 +0200 |
commit | 579048eaf89784ec1da8592d96311fafd49aea1a (patch) | |
tree | 61bf0c50c656f2b16ed8901ec3b07fb468ffb916 /src | |
download | sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.gz sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.bz2 sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.lz sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.xz sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.zst sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.zip |
Diffstat (limited to '')
-rw-r--r-- | src/api.c | 163 | ||||
-rw-r--r-- | src/hp.html | 49 | ||||
-rw-r--r-- | src/i18n.h | 0 | ||||
-rw-r--r-- | src/lib.c | 33 | ||||
-rw-r--r-- | src/log.c | 60 | ||||
-rw-r--r-- | src/main.c | 43 | ||||
-rw-r--r-- | src/structs.c | 118 | ||||
-rw-r--r-- | src/url.c | 30 |
8 files changed, 496 insertions, 0 deletions
diff --git a/src/api.c b/src/api.c
new file mode 100644
index 0000000..ae8d619
--- /dev/null
+++ b/src/api.c
@@ -0,0 +1,238 @@
+/* thin wrappers: the isfmt argument becomes 01 when printf arguments follow the endpoint and 0 otherwise */
+#define SC_CAPI(c, b, h, e, ...) sc_api(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__)
+#define SC_CAPIX(c, b, h, e, ...) sc_capix(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__)
+/* Expands the printf format endpoint with ap into a malloc'd string. Returns NULL on failure; the caller frees the result. */
+static char * sc_endpoint_vformat (const char * endpoint, va_list ap) {
+	va_list ap2;
+	va_copy(ap2, ap);
+	int length = vsnprintf(NULL, 0, endpoint, ap2); /* the first pass only measures */
+	va_end(ap2);
+	if (length < 0)
+		return NULL;
+	char * formatted = malloc((size_t) length+1);
+	if (formatted)
+		vsnprintf(formatted, (size_t) length+1, endpoint, ap);
+	return formatted;
+}
+/*
+	Performs an HTTP request (POST when body is set, GET otherwise) and returns the response body.
+	 c: cache used for logging, must not be NULL
+	 body: request body or NULL
+	 headers: extra request headers, appended to SC_HTTP_HEADERS, or NULL
+	 isfmt: nonzero when endpoint is a printf format whose arguments follow
+	 endpoint: URL or printf format of the URL, must not be NULL
+	returns a malloc'd, NUL-terminated buffer the caller must free, or NULL on invalid arguments, on
+	allocation failure or when the connection could not be opened. a non-2xx status or a read error is only logged.
+*/
+char * sc_api (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) {
+	if (!c || !endpoint)
+		return NULL;
+	/* everything the rc label releases is initialised up front, so every goto below is safe */
+	char * endpoint_formatted = NULL;
+	char * hedrs = NULL;
+	char * contentType = NULL;
+	char * redir = NULL;
+	char * buf = NULL;
+	if (isfmt && parse_printf_format(endpoint, 0, NULL) > 0) {
+		va_list ap;
+		va_start(ap, endpoint);
+		endpoint_formatted = sc_endpoint_vformat(endpoint, ap);
+		va_end(ap);
+		if (!endpoint_formatted) {
+			SC_LOG(SC_LOG_ERROR, c, "formatting of the endpoint failed");
+			goto rc;
+		}
+	}
+	char * url = endpoint_formatted ? endpoint_formatted : endpoint;
+	if (!headers)
+		headers = "";
+	hedrs = malloc(strlen(SC_HTTP_HEADERS)+strlen(headers)+1);
+	size_t buf_sizeof = SC_HTTP_RBUFSIZE;
+	size_t buf_length = 0;
+	buf = malloc(buf_sizeof);
+	if (!hedrs || !buf) {
+		SC_LOG(SC_LOG_ERROR, c, "out of memory, endpoint: %s", url);
+		goto fail;
+	}
+	strcpy(hedrs, SC_HTTP_HEADERS);
+	strcat(hedrs, headers);
+	void * r = xmlNanoHTTPMethodRedir(
+			url,
+			body ? "POST" : "GET",
+			body,
+			&contentType,
+			&redir,
+			hedrs,
+			body ? strlen(body) : 0
+		);
+	if (!r) {
+		SC_LOG(SC_LOG_ERROR, c, "!r, endpoint: %s", url);
+		goto fail; /* buf holds no data yet, so it must not reach the caller */
+	}
+	long response_code = xmlNanoHTTPReturnCode(r);
+	if (response_code < 200 || response_code >= 300)
+		SC_LOG(SC_LOG_ERROR, c, "response_code == %ld, endpoint: %s", response_code, url);
+	int readstatus = 0;
+	/* one byte is always kept back for the terminating NUL */
+	while ((readstatus = xmlNanoHTTPRead(r, buf+buf_length, buf_sizeof-buf_length-1)) > 0) {
+		buf_length += readstatus;
+		if (buf_sizeof-buf_length < SC_HTTP_RBUFSIZE) {
+			size_t bigger_sizeof = buf_sizeof*SC_REALLOC_K;
+			char * bigger = realloc(buf, bigger_sizeof);
+			if (!bigger) { /* keep what was read so far instead of losing buf */
+				SC_LOG(SC_LOG_ERROR, c, "realloc failed, response truncated, endpoint: %s", url);
+				break;
+			}
+			buf = bigger;
+			buf_sizeof = bigger_sizeof;
+		}
+	}
+	if (readstatus == -1)
+		SC_LOG(SC_LOG_ERROR, c, "readstatus == -1, endpoint: %s", url);
+	buf[buf_length] = '\0'; /* callers use strlen() and strstr() on the response */
+	xmlNanoHTTPClose(r);
+	SC_LOG(SC_LOG_DEBUG, c, "contentType = %s, redir = %s", contentType ? contentType : "NULL", redir ? redir : "NULL");
+	goto rc;
+fail:
+	free(buf);
+	buf = NULL;
+rc:
+	free(endpoint_formatted);
+	free(contentType);
+	free(redir);
+	free(hedrs);
+	return buf;
+}
+/*
+	Same as sc_api, but parses the response as HTML.
+	returns a document the caller must release with xmlFreeDoc, or NULL when the request or the parsing failed.
+*/
+htmlDocPtr sc_capix (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) {
+	if (!c || !endpoint)
+		return NULL;
+	char * endpoint_formatted = NULL;
+	if (isfmt && parse_printf_format(endpoint, 0, NULL) > 0) {
+		va_list ap;
+		va_start(ap, endpoint);
+		endpoint_formatted = sc_endpoint_vformat(endpoint, ap);
+		va_end(ap);
+		if (!endpoint_formatted)
+			return NULL;
+	}
+	char * url = endpoint_formatted ? endpoint_formatted : endpoint;
+	/* the endpoint is already expanded, so sc_api must not treat it as a format again */
+	char * buf = sc_api(c, body, headers, 0, url);
+	htmlDocPtr htmldoc = buf ? parseHtmlDocument(buf, url) : NULL;
+	free(buf);
+	free(endpoint_formatted);
+	return htmldoc;
+}
+/*
+	Finds the name of the CSS class whose rule body is definition, for example "{color:#70757a}".
+	The name is taken to start after the nearest '.' in front of the definition and to be alphanumeric.
+	returns a malloc'd string the caller must free, or NULL when no class was found.
+*/
+char * sc_find_class (char * haystack, const char * definition) {
+	if (!haystack || !definition)
+		return NULL;
+	char * class = strstr(haystack, definition);
+	if (!class)
+		return NULL;
+	while (class > haystack && class[-1] != '.') /* walk back to the character after the dot */
+		class--;
+	if (class == haystack) /* there is no dot in front of the definition */
+		return NULL;
+	size_t length = 0; /* google only has alphanumeric class names. TODO: be pedantic and conform to w3 standards */
+	while (isalnum((unsigned char) class[length]))
+		length++;
+	if (!class[length]) /* ran into the end of the document */
+		return NULL;
+	char * toreturn = malloc(length+1);
+	if (!toreturn)
+		return NULL;
+	memcpy(toreturn, class, length);
+	toreturn[length] = '\0';
+	return toreturn;
+}
+/*
+	Queries google for s and, for now, prints the links of the result titles to stdout.
+	returns 1 on success, -1 on invalid arguments, -2 when the request failed, -3 when the title class was not found,
+	-4 on allocation failure, -5 when the response could not be parsed and -6 when the XPath query failed.
+*/
+int sc_query_google (char * s, struct sc_cache * c) {
+	/*
+		remarks:
+		* we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website
+		* we determine which class holds a specific value by looking at the css definitions
+			- result title: the only class that has definition {color:#1967D2;font-size:14px;line-height:16px}
+				+ A links have this class set, but they have a child SPAN element that then holds the text of the title
+				+ A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link.
+			- result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link.
+				+ extract those two classes and find the one that is only present on SPAN text elements.
+			- result description: once we have the result div, the description is the // span with the appropriate class
+				+ the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements.
+			- result div: to get the result div, we need the parent of the parent of the A link of the title.
+		* result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP.
+			- we won't parse those yet
+		* I couldn't find anything with ratings, so we won't parse those either yet
+		* captcha: google knows that this nokia phone we're pretending to be doesn't support javascript
+			- the request limiting captcha must work on a phone without javascript. it is probably loaded inside an iframe, but has
+				origin protection, so we can't just solve it client-side. we would have to proxy images and create some sort of a session
+				based http request-response based user interface so we can ask the user to complete the captcha. this is not yet
+				implemented and will be hard work.
+	*/
+	if (!s || !c)
+		return -1;
+	int rs = 1;
+	/* everything the rc label releases is initialised up front, so every goto below is safe */
+	char * txtdoc = NULL;
+	char * titleclass = NULL;
+	char * xpath = NULL;
+	htmlDocPtr xmldoc = NULL;
+	xmlXPathObjectPtr nodes = NULL;
+	char * us = malloc(strlen(s)*3+1); /* worst case: every byte becomes %XX */
+	if (!us) {
+		rs = -4;
+		goto rc;
+	}
+	urlencode(us, s);
+	txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us);
+	free(us);
+	if (!txtdoc) {
+		rs = -2;
+		goto rc;
+	}
+	titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
+	if (!titleclass) {
+		SC_LOG(SC_LOG_ERROR, c, "!titleclass");
+		rs = -3;
+		goto rc;
+	}
+#define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* matches for example @class='fuLhoc ZWRArf' */
+	size_t xpath_sizeof = strlen(SC_GTXF)+strlen(titleclass)+1;
+	xpath = malloc(xpath_sizeof);
+	if (!xpath) {
+		rs = -4;
+		goto rc;
+	}
+	snprintf(xpath, xpath_sizeof, SC_GTXF, titleclass);
+	fprintf(stdout, "%s\n", xpath);
+	xmldoc = parseHtmlDocument(txtdoc, NULL);
+	if (!xmldoc) {
+		rs = -5;
+		goto rc;
+	}
+	nodes = findNodes(xmldoc, xpath);
+	if (!nodes) {
+		rs = -6;
+		goto rc;
+	}
+	eachNode(nodes, printLinkNode, NULL);
+rc:
+	xmlXPathFreeObject(nodes); /* the libxml2 free functions accept NULL */
+	xmlFreeDoc(xmldoc);
+	free(txtdoc);
+	free(titleclass);
+	free(xpath);
+	return rs;
+}
diff --git a/src/hp.html b/src/hp.html
new file mode 100644
index 0000000..64da49d
--- /dev/null
+++ b/src/hp.html
@@ -0,0 +1,49 @@
+<!DOCTYPE html>
+<html lang=sl>
+	<!-- this file is a printf format. be sure to escape percent signs with percent percent. -->
+	<!-- this format requires the following types (in order): query string, query string, result info string, results html string -->
+	<head>
+		<meta charset=UTF-8 />
+		<title>
+			%s :: sear.c
+		</title>
+		<link rel=stylesheet href=//sijanec.eu/assets/css/styles.css?ref=sear.c /> <!-- TODO: inline the stylesheet directly into the document -->
+		<link rel="shortcut icon" href="data:image/x-icon;," type="image/x-icon"> <!-- prevents favicon lookups -->
+		<link rel="icon" href="data:;base64,iVBORw0KGgo=">
+		<style>
+			input[type=password], input[type=text], input[type=submit], input[type=button] {
+				width: 100%%;
+				height: 1.5cm;
+				font-size: 18px;
+			}
+			input .125 {
+				width: 125%%;
+			}
+			input .50 {
+				width: 50%%;
+			}
+			.result:hover {
+				background: var(--bgc2);
+			}
+		</style>
+	</head>
+	<body>
+		<form>
+			<input type=text name=q class=50 value="%s" placeholder="sear.c ..." />
+			<input type=submit class=125 value=🔍 /> <!-- magnifying glass emoji -->
+			<input type=submit class=125 name=f value=Ʊ /> <!-- horseshoe unicode character -->
+			<input type=submit class=125 name=i value=🖼 /> <!-- framed picture emoji -->
+			<input type=submit class=125 name=v value=🎬 /> <!-- that thing they use in movies to denote start of a scene emoji -->
+		</form>
+		<h3>
+			%s
+		</h3>
+		%s
+		<hr>
+		<h4 align=center>
+			<a href=//git.sijanec.eu/sijanec/sear.c >
+				sear.c
+			</a>
+		</h4>
+	</body>
+</html>
diff --git a/src/i18n.h b/src/i18n.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/i18n.h
diff --git a/src/lib.c b/src/lib.c
new file mode 100644
index 0000000..2c3e34a
--- /dev/null
+++ b/src/lib.c
@@ -0,0 +1,44 @@
+/* Parses the NUL-terminated HTML text d; b is the base URL or NULL. Returns NULL on failure; the caller calls xmlFreeDoc. */
+static htmlDocPtr parseHtmlDocument(const char * d, const char * b /* base url */) {
+	if (!d)
+		return NULL;
+	if (!b)
+		b = "";
+	htmlParserCtxtPtr parser_context = htmlNewParserCtxt();
+	if (!parser_context)
+		return NULL;
+	htmlDocPtr document = htmlCtxtReadMemory(parser_context, d, strlen(d), b, NULL /* encoding */, HTML_PARSE_NOWARNING | HTML_PARSE_NOERROR | HTML_PARSE_RECOVER);
+	htmlFreeParserCtxt(parser_context);
+	return document;
+}
+/* Evaluates xpath_query on document. Returns NULL on failure; the caller calls xmlXPathFreeObject. */
+static xmlXPathObjectPtr findNodes(htmlDocPtr document, const char * xpath_query) {
+	if (!document || !xpath_query)
+		return NULL;
+	xmlXPathContextPtr xpath_ctx = xmlXPathNewContext(document);
+	if (!xpath_ctx)
+		return NULL;
+	xmlXPathObjectPtr nodes = xmlXPathEvalExpression(BAD_CAST xpath_query, xpath_ctx);
+	xmlXPathFreeContext(xpath_ctx);
+	return nodes;
+}
+typedef void (*node_function_t)(xmlNodePtr node, void * data);
+/* Calls f(node, data) for every node of the result set nodes; a NULL or empty set is a no-op. */
+static void eachNode(xmlXPathObjectPtr nodes, node_function_t f, void * data) {
+	if (!nodes || !f || xmlXPathNodeSetIsEmpty(nodes->nodesetval))
+		return;
+	xmlNodeSetPtr nodeset = nodes->nodesetval;
+	for (int i = 0; i < nodeset->nodeNr; i++)
+		f(nodeset->nodeTab[i], data);
+}
+/* node_function_t that prints the href attribute of element nodes to stdout. */
+void printLinkNode(xmlNodePtr node, void * data) {
+	(void) data; /* unused */
+	if (!node || node->type != XML_ELEMENT_NODE)
+		return;
+	xmlChar * href = xmlGetProp(node, BAD_CAST "href");
+	if (!href)
+		return;
+	printf("-> Link to '%s'\n", (const char *) href);
+	xmlFree(href); /* xmlGetProp returns a copy that the caller owns */
+}
diff --git a/src/log.c b/src/log.c
new file mode 100644
index 0000000..d229512
--- /dev/null
+++ b/src/log.c
@@ -0,0 +1,81 @@
+/* Returns the name of the log level t as a static string. */
+const char * sc_log_str (int t) {
+	switch (t) {
+		case SC_LOG_ERROR:
+			return "SC_LOG_ERROR";
+		case SC_LOG_WARNING:
+			return "SC_LOG_WARNING";
+		case SC_LOG_INFO:
+			return "SC_LOG_INFO";
+		case SC_LOG_DEBUG:
+			return "SC_LOG_DEBUG";
+		default:
+			return "SC_LOG_UNKNOWN";
+	}
+	/* interestingly, gcc figures out there's no way for code to reach this section, therefore there's no warning "-Wreturn-type" */
+}
+/* Frees l and its message. Returns 1, or -1 when l is NULL. */
+int sc_logentry_free (struct sc_logentry * l) {
+	if (!l)
+		return -1;
+	free(l->message);
+	free(l);
+	return 1;
+}
+/* Allocates a zeroed log entry. Returns NULL on allocation failure. */
+struct sc_logentry * sc_logentry_init (void) {
+	return calloc(1, sizeof(struct sc_logentry));
+}
+/*
+	Appends an entry to the log of c and prints it to stderr. Normally called through the SC_LOG macro.
+	 t: log level, one of SC_LOG_*
+	 ca, f, l: function, file and line of the call site; ca and f are stored as they are, not copied
+	 isf: nonzero when m is a printf format whose arguments follow
+	returns 1 on success, -1 on invalid arguments, -2 when the cache has no lock, -3 when locking failed,
+	-4 when unlocking failed and -5 when no log entry could be allocated.
+*/
+int sc_push_log (unsigned char t, struct sc_cache * c, const char * ca, char * f, size_t l, unsigned short int isf, char * m, ...) {
+#define SC_PLL c->logentries[c->logentries_length-1]
+	if (!c || !m)
+		return -1;
+	pthread_rwlock_t * lock = c->logentries_lock;
+	if (!lock)
+		return -2;
+	if (pthread_rwlock_wrlock(lock))
+		return -3;
+	int rs = 1;
+	if (c->logentries_sizeof - c->logentries_length == 0) /* grow only once the array is full */
+		SC_BIGGER_ARRAY(c->logentries, sc_logentry);
+	if (c->logentries_sizeof == c->logentries_length || !c->logentries[c->logentries_length]) {
+		rs = -5; /* growing the array or allocating the entry failed */
+		goto rc;
+	}
+	c->logentries_length++;
+	if (isf && parse_printf_format(m, 0, NULL) > 0) {
+		va_list ap, ap2;
+		va_start(ap, m);
+		va_copy(ap2, ap);
+		int strlenm = vsnprintf(NULL, 0, m, ap);
+		if (strlenm >= 0)
+			SC_PLL->message = malloc((size_t) strlenm+1);
+		if (SC_PLL->message)
+			vsnprintf(SC_PLL->message, (size_t) strlenm+1, m, ap2);
+		va_end(ap);
+		va_end(ap2);
+	} else {
+		SC_PLL->message = malloc(strlen(m)+1);
+		if (SC_PLL->message)
+			strcpy(SC_PLL->message, m);
+	}
+	SC_PLL->type = t;
+	SC_PLL->file = f;
+	SC_PLL->line = l;
+	SC_PLL->function = ca;
+	SC_PLL->time = time(NULL);
+	/* in posix, this is thread safe. the raw message is printed when its copy could not be allocated */
+	fprintf(stderr, "[sear.c] %s %s()@%s:%zu: %s\n", sc_log_str(t), ca, f, l, SC_PLL->message ? SC_PLL->message : m);
+rc:
+	if (pthread_rwlock_unlock(lock))
+		return -4;
+	return rs;
+}
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..c42d5c0
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <printf.h>
+#include <ctype.h>
+#include <time.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <i18n.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <libxml/nanohttp.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/HTMLtree.h>
+#include <libxml/tree.h>
+#include <libxml/xpath.h>
+#include <lib.c>
+#include <url.c>
+unsigned char sc_hp[] = { /* html page null terminated format string, from file src/hp.html */
+#include <hp.xxd>
+};
+#define SC_HTTP_PORT 7327 /* SEAR on mobile keyboard */
+#define SC_HTTP_RBUFSIZE 4096 /* initial size of http read buffer, increasing by K */
+#define SC_HTTP_USER_AGENT "Nokia WAP Gateway 4.1 CD1/ECD13_D/4.1.04)" /* so google and others send a minimal response */
+#define SC_HTTP_HEADERS "User-Agent: " SC_HTTP_USER_AGENT "\r\n"
+#include <structs.c>
+#include <log.c>
+#include <api.c>
+/* this is new in my programs. I am now using _sizeof for the actual alloced size of the array and _length for the count of elements in array. this is done to decrease number of calls to realloc & friends */
+/* Runs a single hard-coded test query. Exits with 1 when the cache could not be created and with 2 when the query failed. */
+int main (int argc, char ** argv) {
+	(void) argc; /* no command line arguments yet */
+	(void) argv;
+	int rs = 0;
+	struct sc_cache * c = sc_cache_init();
+	if (!c) {
+		rs = 1;
+		goto rc;
+	}
+	if (sc_query_google("slovenia", c) < 0)
+		rs = 2;
+rc:
+	sc_cache_free(c);
+	return rs;
+}
diff --git a/src/structs.c b/src/structs.c
new file mode 100644
index 0000000..fa4228c
--- /dev/null
+++ b/src/structs.c
@@ -0,0 +1,161 @@
+#define SC_ALLOC_CHUNK 128 /* how many x to alloc when allocing (for performance so we don't call malloc over and over again) */
+#define SC_IN_STRUCT_ARRAY(type, name) _Atomic(type **) name; _Atomic(size_t) name##_sizeof; _Atomic(size_t) name##_length
+/* lock helpers: evaluate to 1 (after logging) when the pthread call fails and to 0 otherwise. the SC_I18N_* strings are not defined in this commit */
+#define SC_CWLE(c, name) (pthread_rwlock_wrlock(name) ? (SC_LOG(SC_LOG_ERROR,c,SC_I18N_LOCKING " " #name " " SC_I18N_FAILED)||1) :0)
+#define SC_CRLE(c, name) (pthread_rwlock_rdlock(name) ? (SC_LOG(SC_LOG_ERROR,c,SC_I18N_LOCKING " " #name " " SC_I18N_FAILED)||1) :0)
+#define SC_CUE(c, name) (pthread_rwlock_unlock(name) ? (SC_LOG(SC_LOG_ERROR,c,SC_I18N_UNLOCKING " " #name " " SC_I18N_FAILED)||1):0)
+#define SC_REALLOC_K 1.5 /* constant to dynamically realloc large arrays (new size = current size * K) */
+#define SC_ENGINE_GOOGLE (1 << 0)
+/* _Atomic(size_t) sc_mem_max = 100e6; */ /* the really soft memory limit of the program: 100MB - NOT IMPLEMENTED */
+#define SC_LOG(t, c, m, ...) sc_push_log(t, c, __func__, __FILE__, __LINE__, 0##__VA_OPT__(1), m __VA_OPT__(,) __VA_ARGS__)
+#define SC_LOG_ERROR (1 << 0)
+#define SC_LOG_WARNING (1 << 1)
+#define SC_LOG_INFO (1 << 2)
+#define SC_LOG_DEBUG (1 << 3)
+/* grows the array name by the factor SC_REALLOC_K and fills the new slots with type##_init(). when realloc fails, the array and name##_sizeof are left untouched */
+#define SC_BIGGER_ARRAY(name, type) do { \
+		size_t sc_ba_sizeof = name##_sizeof*SC_REALLOC_K; \
+		void * sc_ba_array = realloc(name, sizeof(name[0])*sc_ba_sizeof); \
+		if (!sc_ba_array) \
+			break; \
+		name = sc_ba_array; \
+		for (size_t i = name##_sizeof; i < sc_ba_sizeof; i++) \
+			name[i] = type##_init(); \
+		name##_sizeof = sc_ba_sizeof; \
+	} while (0)
+struct sc_logentry {
+	unsigned char type; /* SC_LOG_ERROR, SC_LOG_WARNING, SC_LOG_INFO, SC_LOG_DEBUG */
+	size_t line;
+	const char * function; /* nofree */
+	char * file; /* nofree */
+	char * message; /* yesfree */
+	time_t time;
+};
+int sc_logentry_free (struct sc_logentry * l); /* defined in log.c */
+struct sc_logentry * sc_logentry_init (void); /* defined in log.c */
+
+struct sc_result {
+	struct sc_query * query; /* nofree - free from sc_cache */
+	char * url; /* yesfree */
+	char * desc; /* yesfree */
+	char * title; /* yesfree */
+	time_t date; /* some search engines like to extract a date from a website, store that here */
+	char * html; /* yesfree - cached generated html output of said result or NULL before it's created */
+	unsigned short int rating; /* some search engines like to extract a rating from a website, store that here */ /* not implemented yet */
+	unsigned short int rating_max; /* max rating when above is used /\ */ /* not implemented yet */
+};
+/* Allocates a zeroed result. Returns NULL on allocation failure. */
+struct sc_result * sc_result_init (void) {
+	return calloc(1, sizeof(struct sc_result));
+}
+/* Frees r and the strings it owns. Returns 1, or -1 when r is NULL. */
+int sc_result_free (struct sc_result * r) {
+	if (!r)
+		return -1;
+	free(r->url);
+	free(r->desc);
+	free(r->title);
+	free(r->html);
+	free(r);
+	return 1;
+}
+struct sc_query {
+	struct sc_cache * cache; /* nofree - what cache owns this query */
+	SC_IN_STRUCT_ARRAY(struct sc_result, results); /* yesfree */
+	char * string; /* yesfree - query string, stripped of any excess characters that should be excluded from indexing */
+	time_t lookup_time; /* time of last lookup */
+	unsigned char engines; /* with what engine(s) was the query done - bitmask - if there are results from multiple engines */
+	char * html; /* yesfree - cached generated HTML output of the result or NULL before it's created */
+};
+/* Allocates a query with SC_ALLOC_CHUNK preallocated results. Returns NULL on allocation failure. */
+struct sc_query * sc_query_init (void) {
+	struct sc_query * q = calloc(1, sizeof(struct sc_query));
+	if (!q)
+		return NULL;
+	q->results = calloc(SC_ALLOC_CHUNK, sizeof(struct sc_result *));
+	if (!q->results) {
+		free(q);
+		return NULL;
+	}
+	q->results_sizeof = SC_ALLOC_CHUNK;
+	for (size_t i = 0; i < q->results_sizeof; i++) {
+		q->results[i] = sc_result_init();
+		if (q->results[i]) /* a failed slot stays NULL, which sc_result_free tolerates */
+			q->results[i]->query = q;
+	}
+	return q;
+}
+/* Frees q, its results and the strings it owns. Returns 1, or -1 when q is NULL. */
+int sc_query_free (struct sc_query * q) {
+	if (!q)
+		return -1;
+	free(q->string); /* if they were not alloced, they are NULL, if they were free'd somewhere else, they are also set to NULL */
+	free(q->html); /* setting to NULL here is not necessary, as we'll never use this query struct again */
+	for (size_t i = 0; i < q->results_sizeof; i++)
+		sc_result_free(q->results[i]);
+	free(q->results); /* the pointer array itself */
+	free(q);
+	return 1;
+}
+struct sc_cache {
+	SC_IN_STRUCT_ARRAY(struct sc_query, queries); /* yesfree */
+	pthread_rwlock_t * queries_lock;
+	SC_IN_STRUCT_ARRAY(struct sc_logentry, logentries); /* yesfree */
+	pthread_rwlock_t * logentries_lock;
+};
+int sc_cache_free(struct sc_cache * c); /* defined below, used by sc_cache_init to undo a partial setup */
+/* Allocates a cache with preallocated queries, log entries and both locks. Returns NULL on failure. */
+struct sc_cache * sc_cache_init(void) {
+	struct sc_cache * c = calloc(1, sizeof(struct sc_cache));
+	if (!c)
+		return NULL;
+	/* a _sizeof only becomes nonzero once its array exists, so sc_cache_free can clean up a partial setup */
+	c->queries = calloc(SC_ALLOC_CHUNK, sizeof(struct sc_query *));
+	if (c->queries)
+		c->queries_sizeof = SC_ALLOC_CHUNK;
+	c->logentries = calloc(SC_ALLOC_CHUNK, sizeof(struct sc_logentry *));
+	if (c->logentries)
+		c->logentries_sizeof = SC_ALLOC_CHUNK;
+	for (size_t i = 0; i < c->queries_sizeof; i++) {
+		c->queries[i] = sc_query_init();
+		if (c->queries[i])
+			c->queries[i]->cache = c;
+	}
+	for (size_t i = 0; i < c->logentries_sizeof; i++)
+		c->logentries[i] = sc_logentry_init();
+#define SC_CILI(name) do { \
+		name##_lock = malloc(sizeof(pthread_rwlock_t)); \
+		if (name##_lock && pthread_rwlock_init(name##_lock, NULL)) { \
+			free(name##_lock); \
+			name##_lock = NULL; \
+		} \
+	} while (0)
+	SC_CILI(c->queries);
+	SC_CILI(c->logentries);
+	if (!c->queries || !c->logentries || !c->queries_lock || !c->logentries_lock) {
+		sc_cache_free(c);
+		return NULL;
+	}
+	return c;
+}
+/* Frees c with all its queries, log entries and locks. Returns 1, or -1 when c is NULL. */
+int sc_cache_free(struct sc_cache * c) {
+	if (!c)
+		return -1;
+	for (size_t i = 0; i < c->queries_sizeof; i++)
+		sc_query_free(c->queries[i]);
+	free(c->queries);
+	for (size_t i = 0; i < c->logentries_sizeof; i++)
+		sc_logentry_free(c->logentries[i]);
+	free(c->logentries); /* the pointer array itself */
+#define SC_CFLD(name) do { \
+		if (name##_lock) { \
+			pthread_rwlock_destroy(name##_lock); \
+			free(name##_lock); \
+		} \
+	} while (0)
+	SC_CFLD(c->queries);
+	SC_CFLD(c->logentries);
+	free(c);
+	return 1;
+}
diff --git a/src/url.c b/src/url.c
new file mode 100644
index 0000000..a3a29e7
--- /dev/null
+++ b/src/url.c
@@ -0,0 +1,48 @@
+/*
+	Percent-encodes the string i into o; only alphanumerics and . _ - ~ are copied as they are.
+	o must have at least strlen(i)*3+1 bytes of memory allocated and is always NUL-terminated.
+	returns 1 on success, 0 on invalid arguments.
+*/
+int urlencode (char * o, char * i /* o must have at least strlen(i)*3+1 bytes of memory allocated */) {
+	if (!o || !i)
+		return 0;
+	size_t written = 0;
+	for (; *i; i++) {
+		unsigned char ch = (unsigned char) *i; /* a negative char is undefined for isalnum and would print as eight hex digits */
+		if (isalnum(ch) || ch == '.' || ch == '_' || ch == '-' || ch == '~') {
+			o[written++] = *i;
+		} else {
+			sprintf(o+written, "%%%02X", ch);
+			written += 3;
+		}
+	}
+	o[written] = '\0';
+	return 1;
+}
+/*
+	Decodes the percent-encoded string i into o, which is always NUL-terminated.
+	o must have at least strlen(i)+1 bytes of memory allocated.
+	returns 1 on success, 0 on invalid arguments or when a % is not followed by two hex digits (malformed).
+*/
+int urldecode (char * o, char * i /* o must have at least strlen(i)+1 bytes memory allocated */) {
+	if (!o || !i)
+		return 0;
+	size_t written = 0;
+	char buf[] = "00";
+	for (; *i; i++) {
+		if (*i == '%') {
+			/* the second check is skipped when the first fails, so we never read past the terminator */
+			if (!isxdigit((unsigned char) i[1]) || !isxdigit((unsigned char) i[2])) { /* malformed */
+				o[written] = '\0';
+				return 0;
+			}
+			buf[0] = *++i;
+			buf[1] = *++i;
+			o[written++] = (char) strtol(buf, NULL, 16);
+		} else {
+			o[written++] = *i;
+		}
+	}
+	o[written] = '\0';
+	return 1;
+}