7 changes: 7 additions & 0 deletions doc/deployer/new_parser_instruction.txt
@@ -0,0 +1,7 @@
1. A script that caches the full site.
2. Essentially a web scraper that uses Beautiful Soup, with Selenium for browser automation.
3. Selenium was needed because many tags are stored as JS objects that are rendered only when a browser requests the page.
4. Follows a DFS traversal to save memory.
5. Links are scraped from each web page, and the site-local links are pushed onto the stack.
6. Links already visited or already on the stack are left untouched.
7. A full run needs some downtime, so it is preferable to run it only at startup.
51 changes: 51 additions & 0 deletions doc/deployer/newparser.py
@@ -0,0 +1,51 @@
# newparser: crawls the site depth-first so that every page is requested
# once and its response lands in the server-side cache.
import logging

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

url = "http://doer.metastudio.org"
stack = ["/explore/courses"]  # DFS stack of site-local paths
visited = set()               # paths already requested

options = Options()
options.add_argument("--headless")
browser = webdriver.Firefox(options=options)

while stack and len(visited) < 100000:
    path = stack.pop()
    visited.add(path)
    print(url + path)
    print(len(visited))

    try:
        browser.get(url + path)
        html = browser.page_source
    except Exception:
        logging.exception("failed to fetch %s", url + path)
        continue

    if html is None:
        continue

    soup = BeautifulSoup(html, "html.parser")
    for link in soup.find_all("a"):
        target = link.get("href")
        # keep only site-local string hrefs we have not seen yet
        if not isinstance(target, str) or not target.startswith("/"):
            continue
        # skip images and login redirects
        if any(part in target for part in (".gif", ".jpg", ".png", "/accounts/login")):
            continue
        if target not in visited and target not in stack:
            stack.append(target)

browser.quit()





Binary file added doc/developer/GStudioLITE_README/Capture1.JPG
Binary file added doc/developer/GStudioLITE_README/Capture2.JPG
Binary file added doc/developer/GStudioLITE_README/Capture3.JPG
Binary file added doc/developer/GStudioLITE_README/Capture4.JPG
Binary file added doc/developer/GStudioLITE_README/Capture5.JPG
124 changes: 124 additions & 0 deletions doc/developer/GStudioLITE_README/GStudioLITE.md
@@ -0,0 +1,124 @@
## GSTUDIO-LITE: OPTIMIZATION OF GSTUDIO USING BACKEND CACHING WITH REVALIDATION
The aim of the project was to use smart server-side caching technologies to
improve the performance of GStudio instances from the client's perspective,
while keeping the trade-off between caching and dynamic content to a minimum.


We decided to go with filesystem-based caching for our purposes. Memcached, although faster, cannot store a large number of resources. Filesystem-based caching needs a path to be set, in which the cached resources are stored. Since we are using secondary storage, we can cache many resources for a long period of time. The sections below describe how we configured Django for filesystem caching.
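As a minimal sketch, such a configuration looks like the fragment below; the `LOCATION` path is an illustrative assumption, not a value mandated by the project — any directory writable by the Django process will do.

```python
# settings.py (fragment): filesystem-based cache backend.
# The LOCATION path is an assumed example; pick any writable directory.
CACHES = {
    "default": {
        "BACKEND": "django.core.cache.backends.filebased.FileBasedCache",
        "LOCATION": "/var/tmp/django_cache",
    }
}
```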

### DJANGO CACHING ARGUMENTS
![FileSystem Based Caching](Capture1.JPG)

Each cache backend can be given additional arguments to control caching
behavior. These arguments are provided as additional keys in the CACHES
setting. Valid arguments are as follows:

* **TIMEOUT**: The default timeout, in seconds, to use for the cache.
  This argument defaults to 300 seconds (5 minutes). You can set
  TIMEOUT to None so that, by default, cache keys never expire. A
  value of 0 causes keys to expire immediately (effectively "don't cache").
* **OPTIONS**: Any options that should be passed to the cache backend.
  The list of valid options varies with each backend, and cache
  backends backed by a third-party library pass their options
  directly to the underlying cache library.
  Cache backends that implement their own culling strategy (i.e., the
  locmem, filesystem and database backends) honor the following
  options:
    * **MAX_ENTRIES**: The maximum number of entries allowed in
      the cache before old values are deleted. This argument defaults
      to 300.
    * **CULL_FREQUENCY**: The fraction of entries that are culled
      when MAX_ENTRIES is reached. The actual ratio is
      1 / CULL_FREQUENCY, so set CULL_FREQUENCY to 2 to cull
      half the entries when MAX_ENTRIES is reached. This argument
      should be an integer and defaults to 3.
      A value of 0 for CULL_FREQUENCY means that the entire
      cache will be dumped when MAX_ENTRIES is reached. On
      some backends (the database backend in particular) this makes
      culling much faster at the expense of more cache misses.
  Memcached backends pass the contents of OPTIONS as keyword
  arguments to the client constructors, allowing for more advanced
  control of client behavior.
* **KEY_PREFIX**: A string that will be automatically included
  (prepended by default) in all cache keys used by the Django server.
* **VERSION**: The default version number for cache keys generated by
  the Django server.
* **KEY_FUNCTION**: A string containing a dotted path to a function
  that defines how to compose a prefix, version and key into a final
  cache key.
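Putting these arguments together, a tuned filesystem cache entry might look like the following sketch; the path and the specific numbers are illustrative assumptions, not values taken from the project.

```python
# settings.py (fragment): filesystem cache with explicit tuning arguments.
# LOCATION and all numeric values below are assumed examples.
CACHES = {
    "default": {
        "BACKEND": "django.core.cache.backends.filebased.FileBasedCache",
        "LOCATION": "/var/tmp/django_cache",
        "TIMEOUT": 600,           # entries live 10 minutes by default
        "KEY_PREFIX": "gstudio",  # avoids key collisions between instances
        "VERSION": 1,
        "OPTIONS": {
            "MAX_ENTRIES": 10000,   # start culling past this count
            "CULL_FREQUENCY": 3,    # cull 1/3 of entries when full
        },
    }
}
```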

### MIDDLEWARE
![Middleware](Capture2.JPG)

* **CACHE_MIDDLEWARE_ALIAS**: The cache alias to use for storage.
* **CACHE_MIDDLEWARE_SECONDS**: The number of seconds each page
  should be cached.
* **CACHE_MIDDLEWARE_KEY_PREFIX**: If the cache is shared across
  multiple sites using the same Django installation, set this to the name of
  the site, or some other string that is unique to this Django instance, to
  prevent key collisions. Use an empty string if you don't care.
* **FetchFromCacheMiddleware** caches GET and HEAD responses with status
  200, where the request and response headers allow. Responses to requests
  for the same URL with different query parameters are considered to be
  unique pages and are cached separately. This middleware expects that a
  HEAD request is answered with the same response headers as the
  corresponding GET request, in which case it can return a cached GET
  response for a HEAD request.
* Additionally, **UpdateCacheMiddleware** automatically sets a few headers in
  each HttpResponse:
    * Sets the Expires header to the current date/time plus the defined
      **CACHE_MIDDLEWARE_SECONDS**.
    * Sets the Cache-Control header to give a max-age for the page, again
      from the **CACHE_MIDDLEWARE_SECONDS** setting.
* If a view sets its own cache expiry time (i.e. it has a max-age section in its
  Cache-Control header) then the page will be cached until that expiry time,
  rather than **CACHE_MIDDLEWARE_SECONDS**. Using the decorators in
  django.views.decorators.cache you can easily set a view's expiry time (using
  the cache_control() decorator) or disable caching for a view (using the
  never_cache() decorator).
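For the site-wide cache, the two middleware classes have to bracket the rest of the stack. A sketch of the relevant settings, with illustrative values (the prefix and the number of seconds are assumptions):

```python
# settings.py (fragment): site-wide cache middleware.
# UpdateCacheMiddleware must come first, FetchFromCacheMiddleware last,
# so that caching wraps every other middleware.
MIDDLEWARE = [
    "django.middleware.cache.UpdateCacheMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.cache.FetchFromCacheMiddleware",
]

CACHE_MIDDLEWARE_ALIAS = "default"       # which CACHES entry to use
CACHE_MIDDLEWARE_SECONDS = 600           # assumed: cache pages for 10 minutes
CACHE_MIDDLEWARE_KEY_PREFIX = "gstudio"  # assumed site-unique prefix
```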

### FULL-SITE CACHING
We were asked to develop a mechanism that would cache the entire site at the user's discretion. Hence we developed a crawler script in Python that uses Beautiful Soup and Selenium to crawl through all the pages of a website. As the requests are made, the responses are cached continuously in the filesystem backend. They remain there until any of the pages is updated, in which case a fresh response is served and replaces the stale entry in the cache.

### BEAUTIFUL SOUP LIBRARY
Beautiful Soup is a Python library for pulling data out of HTML and XML
files. It works with your favorite parser to provide idiomatic ways of
navigating, searching, and modifying the parse tree. It commonly saves
programmers hours or days of work. Beautiful Soup supports the HTML
parser included in Python's standard library, but it also supports a number
of third-party Python parsers.

* To parse a document, pass it into the BeautifulSoup constructor:
**soup=BeautifulSoup(content,"html.parser")**
* **find_all()** method of the soup object helps us find something like all
the "a" tags, or anything more complicated than the first tag with a
certain name.
* We can use the **get()** method as follows: tag.get('attr'), in order to
get the value of the specified attribute in a particular tag.
* **prettify()** method of the soup object formats the parse tree as a
  nicely indented string.
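The three calls above can be sketched together on a small hand-written page (the HTML below is a made-up example, not a GStudio page):

```python
from bs4 import BeautifulSoup

# Toy document standing in for a fetched page.
html = """
<html><body>
  <a href="/explore/courses">Courses</a>
  <a href="http://example.org/external">External</a>
  <img src="/logo.png">
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")

# find_all('a') returns every anchor tag in the parse tree;
# get('href') reads the attribute value from each tag.
links = [a.get("href") for a in soup.find_all("a")]
local = [h for h in links if h and h.startswith("/")]
print(local)  # ['/explore/courses']
```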

### SELENIUM
WebDriver in Selenium is a tool for automating web application testing,
and in particular to verify that they work as expected. WebDriver is the
name of the key interface against which tests should be written, but there
are several implementations. One such implementation is the Firefox
driver, which controls the Firefox browser using a Firefox plugin.

* The **get** method in the WebDriver API is used to navigate to the webpage
whose URL is specified as an argument to the function.
* The **page_source** attribute in the WebDriver API returns the content
of the webpage currently opened by the WebDriver, rendered entirely as HTML.
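A minimal sketch of the two calls together, assuming a local Firefox with geckodriver available (this launches a real browser, so it is a sketch rather than a drop-in snippet):

```python
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("--headless")  # run without a visible window
browser = webdriver.Firefox(options=options)
try:
    # get() navigates to the URL; JS-rendered tags become part of the DOM
    browser.get("http://doer.metastudio.org/explore/courses")
    html = browser.page_source      # full rendered HTML as a string
finally:
    browser.quit()
```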

### CACHE REVALIDATION

We put in place an HTTP cache revalidation mechanism that revalidates our cache entries on each access, so as to avoid serving old responses when the relevant pages have been updated since the cache was last refreshed. We used Cache-Control headers for this (see the section below for documentation). These headers ask the cache to revalidate its entries by comparing the timestamps of the cache entries with the timestamps of the corresponding records in the back-end database. The following flowchart describes the mechanism.

![Cache Revalidation](Capture5.JPG)

### CACHE CONTROL DECORATOR
The following decorator in django.views.decorators.cache controls server-side
and client-side caching.

![cache_control](Capture3.JPG)

![patch_cache_control](Capture4.JPG)
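The Bib_App views in this change apply exactly this decorator with `must_revalidate=True, max_age=6`; as a sketch (the view names below are hypothetical, not from the codebase):

```python
from django.views.decorators.cache import cache_control, never_cache

# Mirrors the pattern applied to the Bib_App views in this change:
# responses may be reused for up to 6 seconds, but must be revalidated
# against the backend before being served again.
@cache_control(must_revalidate=True, max_age=6)
def bibtex_list(request):  # hypothetical view name
    ...

# Opt a view out of caching entirely.
@never_cache
def edit_form(request):    # hypothetical view name
    ...
```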
43 changes: 22 additions & 21 deletions gnowsys-ndf/gnowsys_ndf/ndf/views/Bib_App.py
@@ -7,15 +7,16 @@
from django_mongokit import get_database
from django.contrib.auth.models import User
from django.contrib.sites.models import Site

from django.utils.decorators import method_decorator
from django.views.decorators.cache import cache_control

try:
from bson import ObjectId
except ImportError: # old pymongo
from pymongo.objectid import ObjectId

from gnowsys_ndf.settings import GAPPS, MEDIA_ROOT
from gnowsys_ndf.ndf.models import GSystemType, Node
from gnowsys_ndf.ndf.models import GSystemType, Node
from gnowsys_ndf.ndf.views.methods import get_node_common_fields
from gnowsys_ndf.ndf.views.notify import set_notif_val
from gnowsys_ndf.ndf.views.methods import *
@@ -24,7 +25,7 @@
sitename=Site.objects.all()
if sitename :
sitename = sitename[0]
else :
else :
sitename = ""

db = get_database()
@@ -61,10 +62,10 @@
dictionary={'techreport':["author","title","institution","year","type","number","address","month","note","key"]}
Bibtex_entries.append(dictionary)
dictionary={'unpublished':["author","title","note","month","year","key"]}
Bibtex_entries.append(dictionary)
Bibtex_entries.append(dictionary)

##Bib_App function

@cache_control(must_revalidate=True, max_age=6)
def Bib_App(request, group_id):
"""
Renders the main page of the app
@@ -86,12 +87,12 @@ def Bib_App(request, group_id):
variable = RequestContext(request,{'title': title, 'group_id': group_id, 'groupid': group_id})
return render_to_response(template,variable)


@cache_control(must_revalidate=True, max_age=6)
def view_entries(request, group_id,node_id=None):
'''
renders the list view of all entries of a specific type when node_id is known
'''

ins_objectid = ObjectId()
if ins_objectid.is_valid(group_id) is False :
group_ins = collection.Node.find_one({'_type': "Group","name": group_id})
@@ -104,7 +105,7 @@ def view_entries(request, group_id,node_id=None):
group_id = str(auth._id)
else :
pass

if node_id is None:
num = int(request.GET.get('num'))
entry=Bibtex_entries[num]
@@ -124,7 +125,7 @@ def view_entries(request, group_id,node_id=None):

return render_to_response(template,variable)


@cache_control(must_revalidate=True, max_age=6)
def view_entry(request,group_id,node_id):
''' renders list view of entries of a specific bibtex type when the type is known
'''
@@ -147,6 +148,7 @@ def view_entry(request,group_id,node_id):
variable = RequestContext(request, {'entry_inst': entry_inst, 'group_id': group_id, 'groupid': group_id,'title':title})
return render_to_response(template,variable)

@cache_control(must_revalidate=True, max_age=6)
def view_sentry(request,group_id,node_id):
''' for displaying a specific entry
'''
@@ -181,7 +183,7 @@ def view_sentry(request,group_id,node_id):
print "before return"
return render_to_response(template,variable)


@cache_control(must_revalidate=True, max_age=6)
def create_entries(request, group_id):
''' for creating a new bibtex entry_list
'''
@@ -198,7 +200,7 @@ def create_entries(request, group_id):
group_id = str(auth._id)
else :
pass
''' for retreiving the fields of a particular bibtex entry
''' for retreiving the fields of a particular bibtex entry
'''
num = int(request.GET.get('num'))
entry=Bibtex_entries[num]
@@ -215,7 +217,7 @@ def create_entries(request, group_id):
'list_item':list_item,
'num':num
}
entry_node = collection.GSystem()
entry_node = collection.GSystem()
cite=""
i=0
value=""
@@ -249,7 +251,7 @@ def create_entries(request, group_id):
get_node_common_fields(request,entry_node,group_id,GST_BIBTEX)

entry_node.status=u'PUBLISHED'

entry_node.save()
'''
creating a GAttribute of AttributeType BibTex_entry for the already created GSystem
@@ -280,14 +282,14 @@ def create_entries(request, group_id):
entry_list.object_value=unicode(value)
entry_list.save()
return HttpResponseRedirect(reverse('view_entry', kwargs={'group_id': group_id, 'node_id': GST_BIBTEX._id}))
else:
else:
return render_to_response("ndf/create_edit_entries.html",
context_variables,
context_instance=RequestContext(request))




@cache_control(must_revalidate=True, max_age=6)
def delete_sentry(request, group_id, node_id):
"""Change the status to Hidden.
"""
@@ -312,11 +314,11 @@ def delete_sentry(request, group_id, node_id):
gst_bibtex=(s[11:-3])
gst_bibtex=unicode(gst_bibtex, "UTF-8")
op = collection.update({'_id': ObjectId(node_id)}, {'$set': {'status': u"HIDDEN"}})

return HttpResponseRedirect(reverse('view_entry', kwargs={'group_id': group_id,'node_id':gst_bibtex}))



@cache_control(must_revalidate=True, max_age=6)
def edit_entry(request,group_id,node_id):
'''for editing entries
'''
@@ -357,13 +359,13 @@ def edit_entry(request,group_id,node_id):
values.append(each.split('$')[1])
except:
u="not found"
content_org=GST_current.content_org
content_org=GST_current.content_org
value=""
Name=str(gst_entry.name)
i=0
key=""
value=""
list_item=tags
list_item=tags
cite=""
if request.method == "POST":
name=request.POST.get("name")
@@ -392,7 +394,7 @@ def edit_entry(request,group_id,node_id):
cite += "page "+c+","

var +="}"

GST_current.save()
Bibtex=collection.Node.one({'name':'BibTex_entry','_type':'AttributeType'})
Bibtex_entry=collection.Node.one({'subject':GST_current._id,'attribute_type':Bibtex._id})
@@ -414,4 +416,3 @@ def edit_entry(request,group_id,node_id):
variable=RequestContext(request,{'group_id':group_id,'groupid':group_id,'title':GST_current.name ,'tags':tags,'values':values,'zipped':zipped,'content_org':content_org})
template="ndf/edit_entry.html"
return render_to_response(template,variable)
