WebHost: Add `robots.txt` to WebHost (#3157)

* Add a `robots.txt` file to prevent crawlers from scraping the site

* Added `ASSET_RIGHTS` entry to config.yaml to control whether `/robots.txt` is served or not

* Always import robots.py, determine config in route function

* Finish writing a comment

* Remove unnecessary redundant import and config
This commit is contained in:
Chris Wilson 2024-04-20 20:58:56 -04:00 committed by GitHub
parent 915ad61ecf
commit ad4451276d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 44 additions and 5 deletions

View File

@ -23,7 +23,6 @@ def get_app():
from WebHostLib import register, cache, app as raw_app from WebHostLib import register, cache, app as raw_app
from WebHostLib.models import db from WebHostLib.models import db
register()
app = raw_app app = raw_app
if os.path.exists(configpath) and not app.config["TESTING"]: if os.path.exists(configpath) and not app.config["TESTING"]:
import yaml import yaml
@ -34,6 +33,7 @@ def get_app():
app.config["HOST_ADDRESS"] = Utils.get_public_ipv4() app.config["HOST_ADDRESS"] = Utils.get_public_ipv4()
logging.info(f"HOST_ADDRESS was set to {app.config['HOST_ADDRESS']}") logging.info(f"HOST_ADDRESS was set to {app.config['HOST_ADDRESS']}")
register()
cache.init_app(app) cache.init_app(app)
db.bind(**app.config["PONY"]) db.bind(**app.config["PONY"])
db.generate_mapping(create_tables=True) db.generate_mapping(create_tables=True)

View File

@ -51,6 +51,7 @@ app.config["PONY"] = {
app.config["MAX_ROLL"] = 20 app.config["MAX_ROLL"] = 20
app.config["CACHE_TYPE"] = "SimpleCache" app.config["CACHE_TYPE"] = "SimpleCache"
app.config["HOST_ADDRESS"] = "" app.config["HOST_ADDRESS"] = ""
app.config["ASSET_RIGHTS"] = False
cache = Cache() cache = Cache()
Compress(app) Compress(app)
@ -82,6 +83,6 @@ def register():
from WebHostLib.customserver import run_server_process from WebHostLib.customserver import run_server_process
# to trigger app routing picking up on it # to trigger app routing picking up on it
from . import tracker, upload, landing, check, generate, downloads, api, stats, misc from . import tracker, upload, landing, check, generate, downloads, api, stats, misc, robots
app.register_blueprint(api.api_endpoints) app.register_blueprint(api.api_endpoints)

14
WebHostLib/robots.py Normal file
View File

@ -0,0 +1,14 @@
from WebHostLib import app
from flask import abort
from . import cache
# NOTE(review): cache.cached() is applied outside app.route(), so route() receives
# the plain function before the cache wrapper exists -- confirm whether the
# registered view is actually meant to be cached.
@cache.cached()
@app.route('/robots.txt')
def robots():
    """Serve the crawler-blocking ``robots.txt`` unless asset rights are affirmed.

    Reads ``app.config["ASSET_RIGHTS"]`` on every call. When it is truthy the
    request is aborted with a 404; otherwise the static ``robots.txt`` file is
    returned.
    """
    # The host has affirmed this to be the official WebHost: no robots.txt.
    if app.config["ASSET_RIGHTS"]:
        abort(404)

    # This host is not official, so do not allow search engine crawling.
    return app.send_static_file('robots.txt')

View File

@ -0,0 +1,20 @@
User-agent: Googlebot
Disallow: /
User-agent: APIs-Google
Disallow: /
User-agent: AdsBot-Google-Mobile
Disallow: /
User-agent: AdsBot-Google
Disallow: /
User-agent: Mediapartners-Google
Disallow: /
User-agent: Google-Safety
Disallow: /
User-agent: *
Disallow: /

View File

@ -1,4 +1,4 @@
# This is a sample configuration for the Web host. # This is a sample configuration for the Web host.
# If you wish to change any of these, rename this file to config.yaml # If you wish to change any of these, rename this file to config.yaml
# Default values are shown here. Uncomment and change the values as desired. # Default values are shown here. Uncomment and change the values as desired.
@ -25,7 +25,7 @@
# Secret key used to determine important things like cookie authentication of room/seed page ownership. # Secret key used to determine important things like cookie authentication of room/seed page ownership.
# If you wish to deploy, uncomment the following line and set it to something not easily guessable. # If you wish to deploy, uncomment the following line and set it to something not easily guessable.
# SECRET_KEY: "Your secret key here" # SECRET_KEY: "Your secret key here"
# TODO # TODO
#JOB_THRESHOLD: 2 #JOB_THRESHOLD: 2
@ -38,7 +38,7 @@
# provider: "sqlite" # provider: "sqlite"
# filename: "ap.db3" # This MUST be the ABSOLUTE PATH to the file. # filename: "ap.db3" # This MUST be the ABSOLUTE PATH to the file.
# create_db: true # create_db: true
# Maximum number of players that are allowed to be rolled on the server. After this limit, one should roll locally and upload the results. # Maximum number of players that are allowed to be rolled on the server. After this limit, one should roll locally and upload the results.
#MAX_ROLL: 20 #MAX_ROLL: 20
@ -50,3 +50,7 @@
# Host Address. This is the address encoded into the patch that will be used for client auto-connect. # Host Address. This is the address encoded into the patch that will be used for client auto-connect.
#HOST_ADDRESS: archipelago.gg #HOST_ADDRESS: archipelago.gg
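# Asset redistribution rights. If true, the host affirms they have been given explicit permission to redistribute
# the proprietary assets in WebHostLib.
# While this is false (the default), /robots.txt is served and asks crawlers not to index the site;
# when set to true, /robots.txt responds with 404 instead.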
#ASSET_RIGHTS: false