diff options
Diffstat (limited to '')
-rw-r--r-- | src/api.c | 40 |
1 files changed, 32 insertions, 8 deletions
@@ -107,6 +107,22 @@ char * sc_find_class (char * haystack, const char * definition) { /* you must fr toreturn[endofclass-class] = '\0'; return toreturn; } +int sc_fix_url (char ** h) { /* fixes a (result) URL in-place (removes tracking nonsense, so resulting URL is shorter or equal) */ + if (!h || !*h) /* stage 0: prevent accidental death */ + return -1; + if (!strncmp(*h, "/url?q=", strlen("/url?q="))) { /* stage 1: url may be tracking url by google results */ + *h = *h+strlen("/url?q="); + *strchrnul(*h, '&') = '\0'; + urldecode(*h, *h); + } + char * c = NULL; + if ((c = strstr(*h, "googleweblight.com/fp?u="))) { /* stage 2: url may be "light web" tracking url by google results */ + *h = c+strlen("googleweblight.com/fp?u="); /* we could disable this with a cookie but meh, this is easier and _stateless_ */ + *strchrnul(*h, '&') = '\0'; + urldecode(*h, *h); + } /* TODO: be pedantic and remove utm_source and other tracking bullshit */ + return 1; +} struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */ /* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. 
*/ /* if query is not NULL, it MUST be initialized */ @@ -146,7 +162,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s char * xpath = NULL; char * descclass = NULL; char * titleclass = NULL; - char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us); + char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100", us); // fprintf(stdout, "%s\n", txtdoc); free(us); if (!txtdoc) { @@ -162,7 +178,8 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s goto rc; } #define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */ -#define SC_GTXD "../..//table//span[@class='%s']" +#define SC_GTXD /* description */ "../..//table//span[@class='%s']" +#define SC_GTXB /* breadcrumbs */ ".//span[@class='%s']" #define SC_GTR q->results[q->results_length-1] xpath = malloc(strlen(titleclass)+strlen(SC_GTXF)); sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */ @@ -177,16 +194,18 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s if (node->type == XML_ELEMENT_NODE) { xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href"); if (href) { - char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); - if (!strncmp(hreflink, "/url?q=", strlen("/url?q="))) { - hreflink = hreflink+strlen("/url?q="); - *strchrnul(hreflink, '&') = '\0'; - urldecode(hreflink, hreflink); - } + char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* fuck rules, I will rewrite it anyways <= hi future me */ + sc_fix_url(&hreflink); char * x = malloc(strlen(descclass)+strlen(SC_GTXD)); + char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB)); sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */); + sprintf(xbread, SC_GTXB, descclass /* remember, kids, GNU C is fucking legendary */); xmlNodePtr descnode = nthNodeXN(node, x, 0); + if (!descnode) /* description may be above, see 
https://support.google.com/websearch?p=featured_snippets */ + descnode = nthNodeXN(node, "../../div/div", 0); + xmlNodePtr breadnode = nthNodeXN(node, xbread, 0); free(x); + free(xbread); if (q->results_sizeof <= q->results_length) SC_BIGGER_ARRAY(q->results, sc_result); q->results_length++; @@ -206,6 +225,11 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s SC_GTR->desc = malloc(strlen(SC_I18N_NO_DESCRIPTION)+1); strcpy(SC_GTR->desc, SC_I18N_NO_DESCRIPTION); } + SC_GTR->breadcrumbs = (char *) xmlNodeGetContent(breadnode); + if (!SC_GTR->breadcrumbs) { + SC_GTR->breadcrumbs = malloc(strlen(SC_GTR->url)+1); + strcpy(SC_GTR->breadcrumbs, SC_GTR->url); + } } } } |