You are not logged in.

#1 2018-03-15 20:06:09

teckk
Member
Registered: 2013-02-21
Posts: 518

Python QtWebEngine Web page Inspector

This is from a project that I am working on.
This script prints every request a web page makes to the shell and also
writes them to a file for later inspection. Since it uses a browser engine
it gets everything on the page. Usage is similar to using WebKit's
Inspector or Firefox's Firebug. It may not work correctly for every
page. Needs Python3, qt5-base, qt5-webengine, pyqt5-common, python-pyqt5

myinsp.py

#! /usr/bin/env python

#Python QtWebEngine Web page Inspector.
#Saves html to MyInspect.html. Prints page requests to shell and saves them to MyInspect.req
#Saves web cache at ~/.cache/myinsp
#Usage: <script.py> <url> | --disable-gpu for nouveau

import sys, os
from PyQt5.QtGui import QFont
from PyQt5.QtCore import Qt, QUrl, pyqtSignal
from PyQt5.QtNetwork import QNetworkCookie
from PyQt5.QtWidgets import QWidget, QHBoxLayout, QApplication
from PyQt5.QtWebEngineCore import QWebEngineUrlRequestInterceptor
from PyQt5.QtWebEngineWidgets import (QWebEnginePage, 
                                QWebEngineView, QWebEngineSettings)
# User-agent strings offered to the page profile:
#   a -> desktop (Firefox 57 on Windows 10)
#   b -> mobile (Safari on iPhone, iOS 10)
a = ' '.join((
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0)',
    'Gecko/20100101 Firefox/57.0',
))

b = ' '.join((
    'Mozilla/5.0 (iPhone; CPU iPhone OS 10_0_1 like Mac OS X)',
    'AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0',
    'Mobile/14A403 Safari/602.1',
))

class BrowserReqIntercept(QWebEngineUrlRequestInterceptor):
    """Request interceptor that reports every URL the page asks for.

    Each intercepted request URL can be printed to the shell, appended to
    a log file, and/or emitted through ``netS`` when it contains a given
    substring.

    Args:
        parent: Qt parent object passed to the base-class constructor.
        url: URL of the page being inspected (stored, not otherwise used
            by this class).
        print_request: When truthy, print each request URL to stdout.
        get_link: Optional substring; request URLs containing it are
            emitted through ``netS``. Falsy disables the check.
        req_file: Path of the file request URLs are appended to. Falsy
            disables file logging.
    """

    # Carries a request URL that contains ``get_link``.
    netS = pyqtSignal(str)

    def __init__(self,parent,url,print_request,get_link,req_file):
        super(BrowserReqIntercept, self).__init__(parent)
        self.url = url
        self.print_request = print_request
        self.get_link = get_link
        self.req_file = req_file

    def interceptRequest(self,info):
        """Report the URL of one intercepted request.

        Args:
            info: Request info object supplied by Qt; only its
                ``requestUrl()`` is read.
        """
        urlLnk = info.requestUrl().url()

        #Emit page request urls that match the wanted substring
        if self.get_link and self.get_link in urlLnk:
            self.netS.emit(urlLnk)

        #Print requests to shell, spaced
        if self.print_request:
            print('\n' + urlLnk)

        #Write requests to file, spaced
        if self.req_file:
            # Explicit UTF-8 so a URL with non-ASCII characters cannot
            # fail to encode under a restrictive locale.
            with open(self.req_file, 'a', encoding='utf-8') as f:
                f.write(urlLnk + '\n\n')
                
class BrowserPage(QWebEnginePage):
    """Web page that logs its requests and saves its HTML once loaded.

    On construction the page installs a ``BrowserReqIntercept`` on its
    profile and sets the profile's user agent, cache path and persistent
    storage path. When loading finishes, the page's HTML is written to
    ``html_file``.

    Args:
        url: URL being inspected; forwarded to the request interceptor.
        tmp_dir: Directory used for both the HTTP cache and persistent
            storage of the profile.
        html_file: Path the page HTML is written to after loading.
        print_request: Forwarded to the interceptor (print URLs or not).
        timeout: Falsy means exit the process once the HTML is saved;
            truthy keeps the application running.
        tab_web: Widget hosting the view (stored only).
        parent: Accepted for signature compatibility; not used here.
        get_link: Forwarded to the interceptor (substring to watch for).
        req_file: Path of the request log; forwarded to the interceptor.
    """

    link_signal = pyqtSignal(str)
    link_received = pyqtSignal(str)

    def __init__(self,url,tmp_dir,html_file,print_request,timeout,
                tab_web,parent,get_link,req_file):
        super(BrowserPage, self).__init__()

        self.user_agent = (a) #Set user agent here
        self.tmp_dir = tmp_dir
        self.html_file = html_file
        self.req_file = req_file
        self.timeout = timeout
        self.tab_web = tab_web
        # Latest HTML snapshot; defined up front so it can always be read,
        # even before the first asynchronous toHtml() callback arrives.
        self.htmlout_file = ''
        self.loadFinished.connect(self._loadFinished) #Connect to signals
        self.loadProgress.connect(self._loadProgress)
        self.loadStarted.connect(self._loadstart)

        reqs = BrowserReqIntercept(self,url,print_request,get_link,req_file)
        def urlnk():
            # Placeholder slot: matched links are currently ignored.
            return
        reqs.netS.connect(urlnk)
        self.link_received.connect(urlnk)
        self.profile().setHttpUserAgent(self.user_agent)
        self.profile().setRequestInterceptor(reqs)
        self.profile().setCachePath(self.tmp_dir)
        self.profile().setPersistentStoragePath(self.tmp_dir)

    def _loadstart(self):
        """Slot for ``loadStarted``; intentionally does nothing."""
        return

    def htm_src(self,source):
        """Store ``source`` (HTML text) as the latest page snapshot."""
        self.htmlout_file = source

    def _loadProgress(self):
        """Slot for ``loadProgress``; refresh the in-memory HTML snapshot."""
        self.toHtml(self.htm_src)

    #Write html file after page load
    def write_html_file(self):
        """Write the current HTML snapshot to ``html_file`` as UTF-8."""
        with open(self.html_file,'wb') as f:
            f.write(self.htmlout_file.encode('utf-8'))

    def _save_final_html(self,source):
        """Store and save the HTML delivered after loading finished.

        Exits the process afterwards when ``timeout`` is falsy.
        """
        self.htm_src(source)
        self.write_html_file()
        if not self.timeout:
            # sys.exit rather than the site-provided exit(), which is an
            # interactive helper and is not guaranteed to exist.
            sys.exit(0)

    def _loadFinished(self):
        """Slot for ``loadFinished``; announce log files and save the HTML."""
        print('\n'+'Logging to:'+' '+(str(self.html_file))+' '+
            (str(self.req_file)))
        # toHtml() delivers its result through a callback, so request the
        # final document here and write it only once it has arrived;
        # writing immediately could save a stale (or missing) snapshot.
        self.toHtml(self._save_final_html)
            
class BrowserView(QWebEngineView):
    """Web view that opens ``url`` in its own window and inspects it.

    Construction applies the global web settings, derives a window title
    from the URL and immediately starts browsing.

    Args:
        url: Address to load.
        tmp_dir: Cache/storage directory handed to the page.
        html_file: Path the page HTML is saved to.
        print_request: Whether request URLs are printed to the shell.
        timeout: Falsy to exit after the page has loaded, truthy to stay.
        get_link: Optional substring of request URLs to watch for.
        req_file: Path of the request log file.
    """

    def __init__(self,url,tmp_dir,html_file,print_request,timeout,
                get_link,req_file):
        super(BrowserView, self).__init__()

        #Set font size, images on/off, scripts on/off
        self.settings().globalSettings().setFontSize(
                    QWebEngineSettings.MinimumFontSize, (22))
        self.settings().globalSettings().setAttribute(
                    QWebEngineSettings.AutoLoadImages, True)
        self.settings().globalSettings().setAttribute(
                    QWebEngineSettings.JavascriptEnabled, True)

        self.url = url
        self.domain_name = self._domain_of(url)
        self.tmp_dir = tmp_dir
        self.html_file = html_file
        self.get_link = get_link
        self.print_request = print_request
        self.req_file = req_file
        self.timeout = timeout
        self.Browse(self.url)

    @staticmethod
    def _domain_of(url):
        """Return the host part of ``url`` without a leading ``www.``.

        Only ``http://`` and ``https://`` URLs are split; anything else
        is used as given. Requiring the full scheme prefix guarantees the
        split has a third element, so input such as ``httpbin.org`` no
        longer raises IndexError.
        """
        dm = url
        if url.startswith(('http://', 'https://')):
            dm = url.split('/')[2]
        if dm.startswith('www.'):
            dm = dm.replace('www.','',1)
        return dm

    def get_window_object(self):
        """Return the top-level widget that hosts this view."""
        return self.tab_web

    def start_loading(self):
        """Browse to the URL this view was created with."""
        self.Browse(self.url)

    def gethtml(self):
        """Return the page's most recent HTML snapshot."""
        return self.web.htmlout_file

    def Browse(self,url):
        """Create the host window and page, then load ``url``."""
        self.tab_web = QWidget()
        self.tab_web.setMinimumSize(1000,800) #Browser window size
        self.tab_web.show()
        self.tab_web.setWindowTitle(self.domain_name)
        self.horizontalLayout_5 = QHBoxLayout(self.tab_web)
        self.horizontalLayout_5.addWidget(self)
        self.web = BrowserPage(url,self.tmp_dir,self.html_file,
                               self.print_request,self.timeout,
                               self.tab_web,self,
                               self.get_link,self.req_file)
        self.setPage(self.web)
        if self.url is not None:
            self.load(QUrl(url))
        QApplication.processEvents()

def main():
    """Run the inspector on a URL from the command line or a prompt.

    The first command-line argument that does not start with ``--`` is
    taken as the URL, so switches such as ``--disable-gpu`` may appear
    before or after it. Without such an argument the URL is read from
    standard input. The process exits with the Qt event loop's return
    code.
    """
    app = QApplication(sys.argv)
    #Path for cache, if not exists make it.
    tmp_dir = os.path.join(os.path.expanduser('~'),'.cache','myinsp')
    os.makedirs(tmp_dir, exist_ok=True)

    #Open with arguments or prompt for input, allow --disable-gpu for nouveau
    # Picking the first non-switch argument avoids indexing past the end
    # of sys.argv when only a switch was supplied.
    url = next((arg for arg in sys.argv[1:] if not arg.startswith('--')),
               None)
    if url is None:
        url = input('Enter/Paste url to Inspect: ').strip()

    html_file = 'MyInspect.html'    # html outfile
    req_file = 'MyInspect.req'  # requests outfile
    print_request = True    # print requests to shell or not
    timeout = 1 # 1 for no exit after page load, 0 to exit

    get_link = None
    # Bound to a name so the view object stays referenced while the
    # event loop runs.
    web = BrowserView(url,tmp_dir,html_file,print_request,timeout,
        get_link,req_file)

    go = app.exec_()
    sys.exit(go)

# Run the inspector only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Edit: Missed a dependency

Last edited by teckk (2018-03-15 21:37:18)

Offline

Board footer

Powered by FluxBB