DOM traversal using headless Mozilla

Project

Build

  • .bashrc
    export MOZILLA_PATH="/path/to/mozilla/dist"
    export MOZILLA_SDK_PATH="$MOZILLA_PATH/sdk"
    export MOZILLA_FIVE_HOME="$MOZILLA_PATH/bin"
    export LD_LIBRARY_PATH="$MOZILLA_PATH/lib:$LD_LIBRARY_PATH"
    
  • offscreen/embedding/browser/headless/src/moz-headless.h
    • embedding/browser/headless/src/moz-headless.h

      diff -r 7d8630909bd6 embedding/browser/headless/src/moz-headless.h
      a b  
      106106  (G_TYPE_INSTANCE_GET_CLASS ((obj), \ 
      107107  MOZ_TYPE_HEADLESS, MozHeadlessClass)) 
      108108 
       109class nsIDOMDocument; 
       110 
      109111typedef struct { 
      110112  gint x; 
      111113  gint y; 
       
      254256MOZHEADLESS_API(void,   moz_headless_get_document_size,(MozHeadless *headless, 
      255257                                                        gint *width, 
      256258                                                        gint *height)) 
       259MOZHEADLESS_API(void,   moz_headless_get_dom_document, (MozHeadless *headless, 
       260                                                        nsIDOMDocument **document 
       261            )) 
      257262MOZHEADLESS_API(void,   moz_headless_freeze_updates,   (MozHeadless *headless, 
      258263                                                        gboolean frozen)) 
      259264MOZHEADLESS_API(void,   moz_headless_invalidate,       (MozHeadless *headless, 
  • offscreen/embedding/browser/headless/src/moz-headless.cpp
    • embedding/browser/headless/src/moz-headless.cpp

      diff -r 7d8630909bd6 embedding/browser/headless/src/moz-headless.cpp
      a b  
      5151#include "nsIWidget.h" 
      5252 
      5353#include "nsIDOMWindowInternal.h" 
       54#include "nsIDOMDocument.h" 
      5455#include "nsIIOService.h" 
      5556#include "nsIWindowWatcher.h" 
      5657#include "nsGUIEvent.h" 
       
      11641165} 
      11651166 
      11661167void 
       1168moz_headless_get_dom_document (MozHeadless *headless, nsIDOMDocument **document) 
       1169{ 
       1170    MozHeadlessPrivate *priv; 
       1171       
       1172    g_return_if_fail(headless != NULL); 
       1173    g_return_if_fail(MOZ_IS_HEADLESS(headless)); 
       1174       
       1175    priv = headless->priv; 
       1176 
       1177    // get the web browser 
       1178    nsCOMPtr<nsIWebBrowser> webBrowser; 
       1179    priv->priv->mWindow->GetWebBrowser(getter_AddRefs(webBrowser)); 
       1180 
       1181    // get the content DOM window for that web browser 
       1182    nsCOMPtr<nsIDOMWindow> domWindow; 
       1183    webBrowser->GetContentDOMWindow(getter_AddRefs(domWindow)); 
       1184 
       1185    //  get the dom document 
       1186    nsCOMPtr<nsIDOMDocument> domDocument; 
       1187    domWindow->GetDocument(getter_AddRefs(domDocument)); 
       1188 
       1189    if (!domDocument) { 
       1190        *document = nsnull; 
       1191    } else { 
       1192        CallQueryInterface(domDocument, document); 
       1193    } 
       1194} 
       1195 
       1196void 
      11671197moz_headless_freeze_updates (MozHeadless *headless, gboolean frozen) 
      11681198{ 
      11691199  MozHeadlessPrivate *priv; 
  • Makefile
    # Mozilla locations come from the environment (see the .bashrc exports above);
    # make imports environment variables, so no shell round-trip is needed.
    MOZ_PATH        = $(MOZILLA_PATH)
    MOZ_SDK_PATH    = $(MOZILLA_SDK_PATH)
    
    # Compiler switches used for the traversal program.
    CPPFLAGS += -fPIC \
                -fno-rtti \
                -fno-exceptions \
                -fno-inline \
                -Wall \
                -Wpointer-arith \
                -Woverloaded-virtual \
                -Wsynth \
                -Wno-ctor-dtor-privacy \
                -Wno-non-virtual-dtor \
                -Wcast-align \
                -Wno-invalid-offsetof \
                -Wno-long-long \
                -pedantic \
                -fno-strict-aliasing \
                -fshort-wchar \
                -pthread \
                -pipe
    
    # Header search paths: glib, the Mozilla SDK, the Mozilla dist tree and its cairo.
    GECKO_INCLUDES = -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include \
                     -I$(MOZ_SDK_PATH)/include -I$(MOZ_PATH)/include \
                     -I$(MOZ_PATH)/include/cairo
    LDLIBS  = -lcairo -lglib-2.0 -lxul -lsqlite3 -lmozjs -lsoftokn3 \
              -lnssutil3 -lnss3 -lnspr4 -lxpcom
    LDFLAGS = -L$(MOZ_SDK_PATH)/lib -L$(MOZ_PATH)/lib -L$(MOZ_PATH)/bin
    
    # Neither target produces a file of its own name.
    .PHONY: all clean
    
    all: traversal
    
    # -f keeps "make clean" from failing when nothing has been built yet.
    clean:
        rm -f traversal
    
    # Single-step compile and link; the static XPCOM glue archive goes last.
    traversal: traversal.cpp
        $(CXX) -o $@ $< $(GECKO_INCLUDES) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) \
            $(MOZ_PATH)/lib/libxpcomglue_s.a
    
  • traversal.cpp
    #include <iostream>
    #include <iomanip>
    
    #include <glib.h>
    #include <signal.h>
    #include <cairo.h>
    #include <moz-headless.h>
    
    #include "nsCOMPtr.h"
    #include "nsDOMString.h"
    #include "nsServiceManagerUtils.h"
    
    #include "nsIDOMDocument.h"
    #include "nsIDOMElement.h"
    #include "nsIDOMNodeList.h"
    #include "inIDOMUtils.h"
    #include "nsIDOMCSSRule.h"
    #include "nsISupportsArray.h"
    
    // Initial viewport size handed to moz_headless_set_size() in main().
    #define WIDTH 800
    #define HEIGHT 600
    
    using namespace std;
    
    // Browser instance created in main(); also used by the SIGINT handler.
    static MozHeadless *headless;
    // Main loop created and run in main(); updated_cb() quits it.
    static GMainLoop *mainloop;
    // Set to TRUE by net_stop_cb(); updated_cb() quits the loop once it is set.
    static gboolean resized = FALSE;
    // Image surface allocated in net_stop_cb(); its pixel data is passed to
    // moz_headless_set_surface().
    static cairo_surface_t *surface;
    
    // Forward declarations for the helpers defined further down.
    void walkDOM(nsIDOMElement *element, nsIDOMDocument *document, PRUint32 level);
    void ouputPrefix(PRUint32 level);
    
    // "net-stop" signal handler (also invoked from leave()).
    //
    // Queries the document size, allocates an ARGB32 cairo image surface of that
    // size, hands its pixel buffer to the headless browser, resizes the browser
    // to match and sets the global `resized` flag.
    //
    // headless: the browser instance to resize.
    static void net_stop_cb(MozHeadless *headless) {
        // Zero-initialised so the values are defined even if the size query
        // returns without writing to them.
        gint width = 0;
        gint height = 0;
        moz_headless_get_document_size(headless, &width, &height);

        // This handler can run more than once; keep the earlier surface so it
        // can be released after the replacement has been installed.
        cairo_surface_t *previous = surface;

        surface = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, width, height);
        moz_headless_set_surface(headless,
                                 cairo_image_surface_get_data(surface),
                                 width,
                                 height,
                                 // Row stride as reported by cairo instead of
                                 // an assumed width*4.
                                 cairo_image_surface_get_stride(surface));
        moz_headless_set_size(headless, width, height);
        resized = TRUE;

        // The browser now has the new buffer, so the old one is dropped here.
        if (previous) {
            cairo_surface_destroy(previous);
        }
    }
    
    // "updated" signal handler.
    //
    // Fetches the current DOM document from the headless browser and prints its
    // element tree via walkDOM(). Once net_stop_cb() has set `resized`, the main
    // loop is asked to quit.
    //
    // headless:            browser instance that emitted the signal.
    // x, y, width, height: signal arguments; not used by this handler.
    static void updated_cb(MozHeadless *headless,
                           gint x,
                           gint y,
                           gint width,
                           gint height) {
        nsCOMPtr<nsIDOMDocument> document;
        moz_headless_get_dom_document(headless, getter_AddRefs(document));

        // The getter can leave the document null; only walk when there is one.
        if (document) {
            nsCOMPtr<nsIDOMElement> rootElem;
            document->GetDocumentElement(getter_AddRefs(rootElem));

            if (rootElem) {
                walkDOM(rootElem, document, 0);
            }
        }

        if (resized) g_main_loop_quit (mainloop);
    }
    
    // SIGINT handler installed by main(): runs the net-stop logic on the global
    // browser instance.
    // NOTE(review): net_stop_cb() allocates and calls into libraries, which is
    // presumably not async-signal-safe; confirm whether this should be
    // deferred to the main loop instead.
    static void leave(int sig) {
        (void) sig;  // the signal number itself is not needed
        net_stop_cb(headless);
    }
    
    // Recursively prints an element and all of its descendant elements to stdout.
    //
    // For every element, indented by four spaces per nesting level, it prints:
    //   - "<localName> Style:",
    //   - the inline "style" attribute when it is non-empty,
    //   - the text of each CSS rule that inIDOMUtils reports for the element.
    //
    // element:  element to print; a null pointer is ignored.
    // document: passed through to the recursive calls; not otherwise used.
    // level:    nesting depth, used for indentation.
    void walkDOM(nsIDOMElement *element, nsIDOMDocument *document, PRUint32 level) {
        if (!element) {
            return;
        }

        // Get node local name
        nsAutoString localName;
        element->GetLocalName(localName);
        ouputPrefix(level);
        cout << "<" << NS_ConvertUTF16toUTF8(localName).get() << "> Style:" << endl;

        // Get node style
        nsAutoString style;
        element->GetAttribute(NS_LITERAL_STRING("style"), style);
        if (style.Length()) {
            ouputPrefix(level);
            cout << NS_ConvertUTF16toUTF8(style).get() << endl;
        }

        // Get node CSS declaration. Both the service lookup and the rule query
        // may yield null, so each result is checked before use.
        nsCOMPtr<inIDOMUtils> domUtils =
            do_GetService("@mozilla.org/inspector/dom-utils;1");

        nsCOMPtr<nsISupportsArray> rules;
        if (domUtils) {
            domUtils->GetCSSStyleRules(element, getter_AddRefs(rules));
        }

        PRUint32 count = 0;
        if (rules) {
            rules->Count(&count);
        }

        for (PRUint32 i = 0; i < count; ++i) {
            // NOTE(review): nsISupportsArray::ElementAt() is understood to return
            // an already-AddRef'd pointer; adopting it with dont_AddRef() avoids
            // leaking one reference per rule. Confirm against the SDK in use.
            nsCOMPtr<nsISupports> item = dont_AddRef(rules->ElementAt(i));
            nsCOMPtr<nsIDOMCSSRule> domRule = do_QueryInterface(item);
            if (!domRule) {
                continue;  // entry is not a CSS rule; nothing to print
            }

            nsAutoString dom_style;
            domRule->GetCssText(dom_style);
            ouputPrefix(level);
            cout << NS_ConvertUTF16toUTF8(dom_style).get() << endl;
        }

        // Child nodes traversal
        nsCOMPtr<nsIDOMNode> childNode;
        element->GetFirstChild(getter_AddRefs(childNode));
        while (childNode) {
            nsCOMPtr<nsIDOMElement> childElem = do_QueryInterface(childNode);

            // Recursion (only element nodes are walked)
            if (childElem) {
                walkDOM(childElem, document, level+1);
            }

            nsCOMPtr<nsIDOMNode> siblingNode;
            childNode->GetNextSibling(getter_AddRefs(siblingNode));
            childNode = siblingNode;
        }
    }
    
    // Writes the indentation for one output line to stdout: the fill character
    // is set to a space and an empty string is printed in a field that is
    // four columns wide per nesting level.
    void ouputPrefix(PRUint32 level) {
        const PRUint32 columns = level*4;
        cout << setfill(' ');
        cout << setw(columns);
        cout << "";
    }
    
    int main(int argc, char **argv) {
        if (argc < 2) {
            cout << "Usage: " << argv[0] << " <url>" << endl;
            return 1;
        }
    
        g_type_init();
        mainloop = g_main_loop_new(NULL, FALSE);
        headless = moz_headless_new();
        moz_headless_set_size(headless, WIDTH, HEIGHT);
    
        g_signal_connect(headless, "net-stop", G_CALLBACK(net_stop_cb), NULL);
        g_signal_connect(headless, "updated", G_CALLBACK (updated_cb), NULL);
    
        moz_headless_load_url(headless, argv[1]);
        signal(SIGINT, leave);
        g_main_loop_run(mainloop);
    
        return 0;
    }