WebHost: Add `robots.txt` to WebHost (#3157)
* Add a `robots.txt` file to prevent crawlers from scraping the site
* Added `ASSET_RIGHTS` entry to config.yaml to control whether `/robots.txt` is served or not
* Always import robots.py, determine config in route function
* Finish writing a comment
* Remove unnecessary redundant import and config
commit ad4451276d
parent 915ad61ecf
				
			
WebHost.py

@@ -23,7 +23,6 @@ def get_app():
     from WebHostLib import register, cache, app as raw_app
     from WebHostLib.models import db
 
-    register()
     app = raw_app
     if os.path.exists(configpath) and not app.config["TESTING"]:
         import yaml
				
			
@@ -34,6 +33,7 @@ def get_app():
         app.config["HOST_ADDRESS"] = Utils.get_public_ipv4()
         logging.info(f"HOST_ADDRESS was set to {app.config['HOST_ADDRESS']}")
 
+    register()
     cache.init_app(app)
     db.bind(**app.config["PONY"])
     db.generate_mapping(create_tables=True)
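Note: moving `register()` below the config block means routes are registered only after the final config values are loaded, and the new route additionally reads `ASSET_RIGHTS` per request rather than at import time. A minimal standalone sketch of that request-time lookup, using plain Flask and made-up values rather than the actual WebHost setup:

from flask import Flask

app = Flask(__name__)
app.config["ASSET_RIGHTS"] = False  # code-level default

@app.route("/robots.txt")
def robots():
    # The flag is read inside the handler, so a config override applied
    # after route registration still takes effect on every request.
    if not app.config["ASSET_RIGHTS"]:
        return "User-agent: *\nDisallow: /\n"
    return "Not found", 404

app.config["ASSET_RIGHTS"] = True  # simulate a later config.yaml override

with app.test_client() as client:
    assert client.get("/robots.txt").status_code == 404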
				
			
WebHostLib/__init__.py

@@ -51,6 +51,7 @@ app.config["PONY"] = {
 app.config["MAX_ROLL"] = 20
 app.config["CACHE_TYPE"] = "SimpleCache"
 app.config["HOST_ADDRESS"] = ""
+app.config["ASSET_RIGHTS"] = False
 
 cache = Cache()
 Compress(app)
				
			
@@ -82,6 +83,6 @@ def register():
 
     from WebHostLib.customserver import run_server_process
     # to trigger app routing picking up on it
-    from . import tracker, upload, landing, check, generate, downloads, api, stats, misc
+    from . import tracker, upload, landing, check, generate, downloads, api, stats, misc, robots
 
     app.register_blueprint(api.api_endpoints)
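Note: adding `robots` to that import list is all the registration needed. Executing the module's top level runs its `@app.route('/robots.txt')` decorator, which registers the endpoint as a side effect of the import; that is what "to trigger app routing picking up on it" refers to. A self-contained illustration of decorator-time registration (plain Flask, not the WebHost modules):

from flask import Flask

app = Flask(__name__)

# Stand-in for "from . import robots": merely defining the handler
# (i.e. executing the imported module's top level) registers the rule.
@app.route("/robots.txt")
def robots():
    return "User-agent: *\nDisallow: /\n"

# No explicit registration call was made, yet the route now exists:
assert any(rule.rule == "/robots.txt" for rule in app.url_map.iter_rules())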
				
			
WebHostLib/robots.py (new file)

@@ -0,0 +1,14 @@
+from WebHostLib import app
+from flask import abort
+from . import cache
+
+
+@cache.cached()
+@app.route('/robots.txt')
+def robots():
+    # If this host is not official, do not allow search engine crawling
+    if not app.config["ASSET_RIGHTS"]:
+        return app.send_static_file('robots.txt')
+
+    # Send 404 if the host has affirmed this to be the official WebHost
+    abort(404)
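Note: both branches are easy to exercise with Flask's test client. A sketch against a throwaway app whose handler mirrors the route above; `make_app` is a hypothetical helper for this sketch, not how WebHost builds its app:

from flask import Flask, abort

def make_app(asset_rights: bool) -> Flask:
    app = Flask(__name__)
    app.config["ASSET_RIGHTS"] = asset_rights

    @app.route("/robots.txt")
    def robots():
        # Unofficial hosts serve the disallow-everything file.
        if not app.config["ASSET_RIGHTS"]:
            return "User-agent: *\nDisallow: /\n"
        # Official host: 404 so crawlers fall back to default behaviour.
        abort(404)

    return app

assert make_app(False).test_client().get("/robots.txt").status_code == 200
assert make_app(True).test_client().get("/robots.txt").status_code == 404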
				
			
WebHostLib/static/robots.txt (new file)

@@ -0,0 +1,20 @@
+User-agent: Googlebot
+Disallow: /
+
+User-agent: APIs-Google
+Disallow: /
+
+User-agent: AdsBot-Google-Mobile
+Disallow: /
+
+User-agent: AdsBot-Google
+Disallow: /
+
+User-agent: Mediapartners-Google
+Disallow: /
+
+User-agent: Google-Safety
+Disallow: /
+
+User-agent: *
+Disallow: /
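Note: the file names several Google crawlers explicitly and then catches everything else with the final `*` group, so the whole site is off limits to any well-behaved bot. The standard library's `urllib.robotparser` can verify that effect; a quick sketch over an abbreviated copy of the rules:

from urllib.robotparser import RobotFileParser

rules = """\
User-agent: Googlebot
Disallow: /

User-agent: *
Disallow: /
""".splitlines()

parser = RobotFileParser()
parser.parse(rules)  # parse() accepts an iterable of lines

# Named and unnamed agents alike are barred from every path.
assert not parser.can_fetch("Googlebot", "/")
assert not parser.can_fetch("SomeOtherBot", "/room/example")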
				
			
docs/webhost configuration sample.yaml

@@ -50,3 +50,7 @@
 
 # Host Address.  This is the address encoded into the patch that will be used for client auto-connect.
 #HOST_ADDRESS: archipelago.gg
+
+# Asset redistribution rights.  If true, the host affirms they have been given explicit permission to redistribute
+# the proprietary assets in WebHostLib
+#ASSET_RIGHTS: false
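Note: an official host opts out of crawler blocking by uncommenting `ASSET_RIGHTS: true`; `get_app()` loads the YAML file and lays its values over the code-level defaults (the real loader in WebHost.py differs in detail). A rough sketch of that override flow, assuming PyYAML (which the diff above already imports) and inlining a hypothetical config.yaml:

import yaml

defaults = {"ASSET_RIGHTS": False, "HOST_ADDRESS": ""}

# What an official host's config.yaml might contain:
raw = "ASSET_RIGHTS: true\nHOST_ADDRESS: archipelago.gg\n"

config = dict(defaults)
config.update(yaml.safe_load(raw))  # file values win over the defaults

assert config["ASSET_RIGHTS"] is True  # /robots.txt now returns 404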
				
			
			