I'm trying to create a webscraping Flask app that creates a new Webdriver instance for each user session, so that different users can scrape content from different pages. This would be simpler if the driver.get() and data collection happened in the same API call, but they can't due to the nature of the scraping I'll be doing. Here's what I have so far:
from flask import Flask, session
from flask_session import Session
# Create the Flask application object the routes below are registered on.
app = Flask(__name__)
# Flask-Session setting: keep session data in files on the server.
SESSION_TYPE = 'filesystem'
# Load this module's upper-case names (here SESSION_TYPE) into app.config.
app.config.from_object(__name__)
# Hook Flask-Session into the app so `session` uses the configured backend.
Session(app)
@app.route('/get_site/<link>', methods=['GET'])
def get_site(link):
    """Start a Chrome WebDriver, keep it in the session and open ``link``.

    Returns a short confirmation string once the page has been requested.
    """
    # NOTE(review): the live driver object itself is placed in the session;
    # the pickling error reported for this snippet presumably comes from the
    # session backend trying to serialize it.
    driver = webdriver.Chrome(options=options)
    session['driver'] = driver
    driver.get(link)
    return 'link opened!'  # confirmation message
@app.route('/scrape_current_site', methods=['GET'])
def scrape_current_site():
    """Return the title of the page held by the session's driver.

    Raises ``KeyError`` if the session has no ``'driver'`` entry.
    """
    driver = session['driver']
    # The title stands in for whatever data would really be collected.
    return driver.title
# Start Flask's built-in development server with its default settings.
app.run()
This doesn't work, though:
AttributeError: Can't pickle local object '_createenviron.<locals>.encode'
Conceptually a flask session feels like it's what I'm looking for (a new, unique object for each session), but I can't figure out how to get it to work.
CodePudding user response:
Store the browser instance in a global dictionary,
and use the session to store a user_id, which is the key into that dict.
Also keep in mind that if we start a browser for each user,
we should also close it if the user does not send any more requests for a while.
I've used this answer to make a background timer that clears unused browsers as well.
import atexit
import threading
import uuid
import time
from flask import Flask, session
app = Flask(__name__)
# Flask needs a secret key to sign the session cookie.
app.secret_key = 'any random string'
# Delay between two runs of the cleanup timer.
POOL_TIME = 5  # seconds
# Idle time after which the cleanup timer drops a user's browser.
MAX_OPEN_BROWSER_TIME = 60  # seconds
# Maps user_id -> browser instance; keeps browsers alive between requests.
userBrowsers = {}
# Maps user_id -> unix time of that user's last browser usage.
userLastuse = {}
# Lock taken by the cleanup timer while it scans the dictionaries above.
dataLock = threading.Lock()
# Placeholder; rebound to a threading.Timer once the cleanup timer is started.
timerThread = threading.Thread()
def create_app():
    """Create the Flask app and start the idle-browser cleanup timer.

    A ``threading.Timer`` re-arms itself every ``POOL_TIME`` seconds. Each
    pass evicts every user whose last use is older than
    ``MAX_OPEN_BROWSER_TIME`` seconds: the user is removed from both
    ``userBrowsers`` and ``userLastuse`` and the browser is quit so its
    process does not linger. An ``atexit`` hook cancels the pending timer and
    quits the browsers that are still open.

    Returns:
        The newly created ``Flask`` application.
    """
    app = Flask(__name__)

    def quitBrowser(userId, browser):
        # Best-effort shutdown: one browser that fails to quit must not stop
        # the cleanup of the others, so log the error and carry on.
        try:
            browser.quit()
        except Exception:
            app.logger.exception("could not quit browser of user %s", userId)

    def interrupt():
        # Stop the pending timer, then close whatever is still open so that
        # no browser process outlives the server.
        timerThread.cancel()
        with dataLock:
            remaining = list(userBrowsers.items())
            userBrowsers.clear()
            userLastuse.clear()
        for userId, browser in remaining:
            quitBrowser(userId, browser)

    def deleteUnusedBrowsers():
        global timerThread
        try:
            now = time.time()
            stale = []
            with dataLock:
                # Collect the ids first: a dict must not change size while it
                # is being iterated.
                expired = [
                    userId
                    for userId, lastuse in userLastuse.items()
                    if now - lastuse > MAX_OPEN_BROWSER_TIME
                ]
                for userId in expired:
                    # Remove the user from BOTH dicts. Leaving the timestamp
                    # behind would make the next pass try to delete the same
                    # browser again and fail with KeyError.
                    del userLastuse[userId]
                    browser = userBrowsers.pop(userId, None)
                    if browser is not None:
                        stale.append((userId, browser))
            # Quit outside the lock so a slow shutdown does not block
            # other users of the lock.
            for userId, browser in stale:
                quitBrowser(userId, browser)
        finally:
            # Run this function again after a delay, even if this pass failed.
            timerThread = threading.Timer(POOL_TIME, deleteUnusedBrowsers, ())
            timerThread.start()

    def startTimer():
        global timerThread
        timerThread = threading.Timer(POOL_TIME, deleteUnusedBrowsers, ())
        timerThread.start()

    startTimer()
    # When you kill Flask (SIGTERM), clear the trigger for the next thread
    atexit.register(interrupt)
    return app
# create_app() builds a brand-new Flask instance, so this assignment replaces
# the module-level ``app`` and, with it, the secret key that was set earlier.
# Set the key on the new instance as well: without one Flask cannot sign the
# session cookie and any use of ``session`` fails.
app = create_app()
app.secret_key = 'any random string'
# ``path`` converter: unlike the default string converter it also accepts
# slashes, which every real URL ("https://host/page") contains.
@app.route('/open_site/<path:url>', methods=['GET'])
def open_site(url):
    """Open ``url`` in the calling user's browser instance.

    Args:
        url: Address to load, taken from the request path.

    Returns:
        A short confirmation string once the page has been requested.
    """
    # NOTE(review): ``url`` comes straight from the request, so the server's
    # browser will visit whatever address a client supplies; restrict the
    # allowed targets if this is exposed to untrusted users.
    browser = get_browser()
    browser.get(url)
    return 'site opened!'
def get_browser():
    """Return the current user's browser, creating it on first use.

    The user is identified by a UUID kept in the session under
    ``"session-id"``; one is generated when the session has none. Every call
    also refreshes the user's last-use timestamp.

    Returns:
        The browser instance registered for the current user.
    """
    # check if user have an id in session or assign an id to user
    user_id = session.get("session-id")
    if user_id is None:
        user_id = uuid.uuid4()
        session["session-id"] = user_id

    # The cleanup timer iterates these dicts under dataLock, so they must be
    # modified under the same lock; otherwise adding a user during a scan
    # raises "dictionary changed size during iteration" in the timer thread.
    # Creating the browser inside the lock also stops two concurrent requests
    # from the same user from starting two browsers.
    with dataLock:
        # check if user have a browser instance or create one for user
        browser = userBrowsers.get(user_id)
        if browser is None:
            browser = webdriver.Chrome(options=options)
            userBrowsers[user_id] = browser
        # updates last use time of browser
        userLastuse[user_id] = time.time()
    return browser
CodePudding user response:
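You could try Flask's `g` object to hold the driver between requests from the same user. (Be aware that Flask documents `g` as being reset with each application context, i.e. normally each request, so verify that the driver really survives.)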
from flask import g
Then attach the driver to `g`, e.g.:
# Attach a new Chrome driver to Flask's `g` object and open the link with it.
# NOTE(review): Flask's documentation describes `g` as scoped to a single
# application context; confirm the driver is still there on a later request.
g._driver = webdriver.Chrome(options=options)
g._driver.get(link)
