Hi, I am trying to download images from BGS borehole scans where there is more than one page, e.g. http://scans.bgs.ac.uk/sobi_scans/boreholes/795279/images/10306199.html and http://scans.bgs.ac.uk/sobi_scans/boreholes/18913699/images/18910430.html
I managed to download the first 2 pages of the first example, but when I get to the last page I get this error. On this page the NextPage variable should be None, as that tag is not present on the webpage. At this point I want to continue to the next location; I haven't added that yet, but I have an Excel list of URLs. The code is based on this: https://automatetheboringstuff.com/2e/chapter12/
Traceback (most recent call last):
File "C:/Users/brentond/Documents/Python/Pdf BGS Scans.py", line 73, in
NextPage = soup.select('a[title="Next page"]')[0]
IndexError: list index out of range
"""Download BGS borehole scan images from an Excel list of URLs.

Walks each borehole's scan pages by following the "Next page" link and
saves every page's image as <Location>-Page<N>.png in `path`.
Based on https://automatetheboringstuff.com/2e/chapter12/
"""
import pyautogui    # NOTE(review): unused in this snippet — confirm before removing
import pyperclip    # NOTE(review): unused in this snippet
import webbrowser   # NOTE(review): unused in this snippet
import PyPDF2       # NOTE(review): unused in this snippet
import os
import openpyxl
import pdfkit       # NOTE(review): unused in this snippet
import requests
import bs4

from requests import Response

# Directory holding the workbook; downloaded images are saved here too.
# BUGFIX: the pasted path had its backslashes stripped — restored here.
path = r'C:\Users\brentond\Documents\TA2'
# Change directory to target location.
os.chdir(path)

# Workbook/worksheet listing borehole URLs (column A) and IDs (column B).
wb = openpyxl.load_workbook('BGS Boreholes.xlsm')
ws = wb['Open']  # wb.get_sheet_by_name() is deprecated in openpyxl

URL = ws['A2'].value       # first page of the first borehole
Location = ws['B2'].value  # borehole ID, used in the image filenames

# BUGFIX: initialise the page counter ONCE, outside the loop. The original
# reset it to 0 every iteration, so every page overwrote "...-Page0.png".
PageNo = 0

while URL is not None:
    # Download the current page and parse it.
    res = requests.get(URL)  # type: Response
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the borehole scan image on this page.
    Scan = soup.select('#image_content img')
    if not Scan:
        print('Could not find scan image.')
    else:
        ScanUrl = Scan[0].get('src')
        print('Downloading image %s...' % ScanUrl)
        res = requests.get(ScanUrl)
        res.raise_for_status()
        # Save the image; the with-block guarantees the file is closed
        # even if a write fails.
        imagePath = os.path.join(path, Location) + "-Page" + str(PageNo) + ".png"
        with open(imagePath, 'wb') as imageFile:
            for chunk in res.iter_content(100000):
                imageFile.write(chunk)
        PageNo += 1

    # BUGFIX (the reported IndexError): the last page has no
    # 'Next page' anchor, so select() returns an empty list. Guard
    # before indexing instead of unconditionally taking element [0].
    NextPage = soup.select('a[title="Next page"]')
    if not NextPage:
        # Finished this borehole; moving to the next location from the
        # Excel list would be handled here.
        break
    URL = NextPage[0].get('href')
    print(URL)

print('Done.')
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…