Python for humanists

Working with Web Data in Python

Overview

Teaching: 30 min
Exercises: 30 min
Questions
  • How can I use data to get specific online content?

Objectives
  • Be able to deal with Nan values.

  • Understand and set data types.

  • Be able to get web-based content.

  • Use a basic loop.

Not Formatted

This lesson has not been formatted and serves as a rough outline of the content that will be covered.

import requests as rq
from PIL import Image
from io import BytesIO
import pandas as pd
#Read in file
cushman_df = pd.read_csv("cushman_encoded.csv")
# Warning is because pandas is having trouble assigning types to the data
# This is not a failure, just something we'll be looking at in the next section
# Alternative way of resolving error - add dtype="object" to set everything as string
print(cushman_df.columns)
Index(['PURL', 'IU Archives Number', 'Cushman number', 'Roll ID',
       'Roll number', 'Year', 'Frame number', 'Start Date', 'End Date',
       'Archive Date', 'Month Unknown', 'Day Unknown', 'Camera settings',
       'Asterisk', 'Description from Notebook', 'Description from Slide Mount',
       'Image Note', 'No Slide Exists', 'Slide Condition', 'ID',
       'Street Address', 'Street Address 2', 'Street', 'Street 2', 'Street 3',
       'Street 4', 'Street 5', 'Neighborhood', 'City', 'City 2', 'City 3',
       'City 4', 'County', 'County 2', 'County 3', 'County 4',
       'State/Province', 'State/Province 2', 'State/Province 3',
       'State/Province 4', 'Country', 'Country 2', 'Country 3', 'Country 4',
       'Personal Names 1', 'Personal Names 2', 'Personal Names 3',
       'Personal Names 4', 'Personal Names 5', 'Personal Names 6',
       'Personal Names 7', 'Personal Names 8', 'Corporate Names 1',
       'Corporate Names 2', 'Corporate Names 3', 'Corporate Names 4',
       'Corporate Names 5', 'Corporate Names 6', 'Corporate Names 7',
       'Corporate Names 8', 'Other Names 1', 'Other Names 2', 'Other Names 3',
       'Other Names 4', 'Genre 1', 'Genre 2', 'Genre 3', 'Genre 4',
       'Topical Subject Headings 1', 'Topical Subject Headings 2',
       'Topical Subject Headings 3', 'Topical Subject Headings 4',
       'Topical Subject Headings 5', 'Topical Subject Headings 6',
       'Topical Subject Headings 7', 'Topical Subject Headings 8',
       'Topical Subject Headings 9', 'Topical Subject Headings 10',
       'Topical Subject Headings 11', 'Topical Subject Headings 12', 'Other 1',
       'Other 2', 'Other 3', 'Other 4', 'Other 5', 'Other 6', 'Other 7',
       'Other 8', 'ColorCorrected', 'LastUpdated'],
      dtype='object')


/Users/tmorrell/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (25,26,39,43,48,57,58,86) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
#We want the Identifier
print(cushman_df['IU Archives Number'][0:5])
0    1411.0
1    1412.0
2    1413.0
3    1414.0
4    1415.0
Name: IU Archives Number, dtype: float64
#It shouldn't be a float
#Let's convert it to int
cushman_df['IU Archives Number'] = cushman_df['IU Archives Number'].astype('int')
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-4-b124fd8598e1> in <module>()
      1 #It shouldn't be a float
      2 #Let's convert it to int
----> 3 cushman_df['IU Archives Number'] = cushman_df['IU Archives Number'].astype('int')


/Users/tmorrell/anaconda/lib/python3.5/site-packages/pandas/core/generic.py in astype(self, dtype, copy, raise_on_error, **kwargs)
   3052         # else, only a single dtype is given
   3053         new_data = self._data.astype(dtype=dtype, copy=copy,
-> 3054                                      raise_on_error=raise_on_error, **kwargs)
   3055         return self._constructor(new_data).__finalize__(self)
   3056 


/Users/tmorrell/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
   3187 
   3188     def astype(self, dtype, **kwargs):
-> 3189         return self.apply('astype', dtype=dtype, **kwargs)
   3190 
   3191     def convert(self, **kwargs):


/Users/tmorrell/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3054 
   3055             kwargs['mgr'] = self
-> 3056             applied = getattr(b, f)(**kwargs)
   3057             result_blocks = _extend_blocks(applied, result_blocks)
   3058 


/Users/tmorrell/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in astype(self, dtype, copy, raise_on_error, values, **kwargs)
    459                **kwargs):
    460         return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
--> 461                             values=values, **kwargs)
    462 
    463     def _astype(self, dtype, copy=False, raise_on_error=True, values=None,


/Users/tmorrell/anaconda/lib/python3.5/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, raise_on_error, values, klass, mgr, **kwargs)
    502 
    503                 # _astype_nansafe works fine with 1-d only
--> 504                 values = _astype_nansafe(values.ravel(), dtype, copy=True)
    505                 values = values.reshape(self.shape)
    506 


/Users/tmorrell/anaconda/lib/python3.5/site-packages/pandas/types/cast.py in _astype_nansafe(arr, dtype, copy)
    529 
    530         if np.isnan(arr).any():
--> 531             raise ValueError('Cannot convert NA to integer')
    532     elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer):
    533         # work around NumPy brokenness, #1987


ValueError: Cannot convert NA to integer
#We have NA values - let's ignore them
subset = cushman_df[pd.notnull(cushman_df['IU Archives Number'])].copy()
#Now we can 
print(subset['IU Archives Number'].dtype)
subset['IU Archives Number'] = subset['IU Archives Number'].astype('int')
print(subset['IU Archives Number'].dtype)
float64
int64
#Now we're going to get the images
base = 'http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P0'
#We use a loop to look through the first 10 rows
for i in range(10):
    # we get the record ID number
    number = subset['IU Archives Number'][i]
    # Translate it into a filename
    url = base + str(number) + '.jpg'
    print(url)
    # we use the response library to get the image in bytes
    response = rq.get(url)
    # we use the image library to translate the bytes to an image
    i = Image.open(BytesIO(response.content))
    # and we save the image
    i.save(str(number) + '.jpg')
    print(number)
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01411.jpg
1411
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01412.jpg
1412
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01413.jpg
1413
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01414.jpg
1414
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01415.jpg
1415
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01416.jpg
1416
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01417.jpg
1417
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01418.jpg
1418
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01419.jpg
1419
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01420.jpg
1420

Key Points