summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x gather.py 27
1 files changed, 20 insertions, 7 deletions
diff --git a/gather.py b/gather.py
index 1948d8e..2507311 100755
--- a/gather.py
+++ b/gather.py
@@ -14,9 +14,10 @@ try:
except ModuleNotFoundError:
raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4")
-if len(argv) != 1+1:
- raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db")
+if len(argv) != 1+2:
+ raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address")
+operator_contact = argv[2]
engine = create_engine(argv[1], echo=True, future=True)
Base = declarative_base()
@@ -53,6 +54,7 @@ logger.debug("welcome to %s", argv[0])
Base.metadata.create_all(engine)
starting_acsm_id = 177238
+guaranteed_large_acsm_id = 1170487
logger.debug(f"created metadata.")
force_acsm_id = 0
@@ -60,6 +62,8 @@ force_acsm_id = 0
valid_acsms = 0
only_isbn_acsms = 0
failed_acsms = 0
+failed_acsms_not200 = 0
+failed_acsms_not200_in_a_row = 0
try:
with Session(engine) as session:
@@ -75,10 +79,19 @@ try:
else:
logger.info(f"continuing from latest {borrow}")
acsm_id = borrow.id+1
- r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm")
+ r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm", headers={"User-Agent": f"python-requests/{requests.__version__} (biblos-stat acsm scraper, contact operator: {operator_contact})"})
+ if (r.status_code == 200):
+ failed_acsms_not200_in_a_row = 0
if r.status_code != 200:
- logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}. latest borrow is {borrow}")
- break
+ logger.warning(f"received http response with error code not 200 (it is {r.status_code}). if this continues for {10-failed_acsms_not200_in_a_row} more requests, I'll assume there are no more borrows on the server.")
+ failed_acsms_not200 += 1
+ failed_acsms_not200_in_a_row += 1
+ force_acsm_id = acsm_id+1
+ if failed_acsms_not200_in_a_row == 10:
+ logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}, which means 10 concurrent responses that are not 200.")
+ if acsm_id < guaranteed_large_acsm_id:
+ logger.error(f"this shouldn't happen. I have a hardcoded value that tells me that at time of program writing, acsm id {guaranteed_large_acsm_id} did exist on the server. dying anyways.")
+ break
elif r.text.startswith('<error xmlns="http://ns.adobe.com/adept" data="E_URLLINK_NO_SUCH_RESOURCE resid urn:uuid:00000000-1002-0000-0009-78'):
isbn = int(r.text.split()[4].split("-").pop())+int(9e12)
borrow = Borrow(id=acsm_id, isbn=isbn, obtained=int(time()))
@@ -91,7 +104,7 @@ try:
force_acsm_id = acsm_id+1
failed_acsms += 1
else:
- acsm = BeautifulSoup(r.text, "xml")
+ acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8")
ft = acsm.fulfillmentToken
expected = f"ACS-BIBL-L-{acsm_id}"
if ft.transaction.string != expected:
@@ -148,7 +161,7 @@ try:
except KeyboardInterrupt:
logger.warning(f"Keyboard interrupt. Exiting. I hope this terminated cleanly. Last requested acsm was discarded.")
-logger.info(f"In this session, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received. Last valid requested acsm was {acsm_id}. Thank you for cooperation.")
+logger.info(f"In this session, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received with response code 200 and {failed_acsms_not200} acsms failed to be received but did not return 200. Last valid requested acsm was {acsm_id}. Thank you for cooperation.")
"""
metadata = MetaData()