Vacation Planning (with BeautifulSoup and NLTK)

Mine/scrape online reviews for a given hotel, then generate word clouds from reviewers' most commonly used adjectives. 

This weekend I was tasked with choosing a hotel for an upcoming vacation. TripAdvisor contains dozens (if not hundreds) of user reviews and photographs for any given hotel. The user interface is easy enough to navigate and the "rating" system is fairly informative. However, once a hotel has accumulated enough reviews for aggregate statistics to be meaningful, I think TripAdvisor would benefit from providing more sophisticated lexical analyses.

The code below uses web-scraping techniques (via BeautifulSoup) to harvest user reviews from TripAdvisor; it then picks out the adjectives using NLTK's WordNet synsets ("synonym sets") and generates word clouds from them. Enjoy!


Step 1: Grab a URL

By clicking through a few pages of reviews, it's clear that the "or10" string in the URL is an iterator, which tracks page count as you navigate through the reviews. You'll want to grab a URL which contains this iterator. If you don't see it, click to the next page of reviews and it should appear.

Step 2: Replace the iterator with "ASDF", then add it to the "urls" dictionary in the code below

from bs4 import BeautifulSoup
from urllib2 import urlopen
import unicodedata
import string

# Map each hotel's short name to its TripAdvisor review URL, with the page
# iterator replaced by the placeholder "ASDF" (see Step 2 above).
# NOTE: the URLs are blank here; fill them in before running the scraper.
urls = {}
urls['paradise island'] = ""
urls['parisian'] = ""
# Hotel names, used both for scraping and for locating the dump files later.
hotels = list(urls)

class Review_object():

    ''' Lightweight container for one scraped
        piece of review text. For now it only
        carries the text itself plus a running
        index; more attributes may follow. '''
    def __init__(self, content, idx):
        # idx: position of this item in the scrape order
        self.idx = idx
        # content: the scraped text (title or review body)
        self.content = content

def make_soup(url):
    ''' Download the page at `url` and return it
    parsed as a BeautifulSoup object. Errors raised
    by urlopen are not caught here. '''
    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # release the connection even if the read fails
        response.close()
    # NOTE: no parser is named, so bs4 picks the best one installed;
    # parse results may differ slightly between machines.
    soup = BeautifulSoup(html)
    return soup

def remove_punctuation(input_string):
    ''' Reduce a piece of scraped review text to lowercase
    ASCII letters and spaces.

    input_string is normally a (unicode) string. Occasionally
    the scraper hands over an object without a usable
    .replace() method; in that case the text is rebuilt by
    joining the items of its .contents attribute (presumably
    a BeautifulSoup tag -- confirm against the scraping loop).

    Tabs, newlines and hyphens become spaces, accented
    characters are folded to their closest ASCII letter, and
    everything that is not a letter or a space (punctuation,
    digits, ...) is dropped. Returns the lowercased result. '''
    try:
        s = input_string.replace('\t', ' ').replace('\n', ' ').replace('-', ' ')
    except (AttributeError, TypeError):
        # not a plain string: stitch the text together from its children
        s = ''.join(xx for xx in input_string.contents)
        s = s.replace('\t', ' ').replace('\n', ' ').replace('-', ' ')

    # Decompose accented characters, then drop whatever has no ASCII
    # equivalent. Decoding again keeps `s` a text string, so the
    # per-character filter below sees characters rather than byte values.
    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
    # Keeping only letters and spaces also removes every punctuation
    # character, so no separate punctuation pass is needed.
    s = ''.join(ch for ch in s if ch.isalpha() or ch == ' ')
    # str() keeps the return type a native string on Python 2 as well
    return str(s.lower())

def dump_data(data, hotel_name):
    ''' Save data to file.

    Writes dump_<hotel_name>.csv in the current directory:
    the first line is the hotel's URL template (from `urls`),
    followed by one "<idx>,<cleaned text>" line per item in
    `data`. Each item must expose .idx and .content. '''
    url = urls[hotel_name]
    # the with-block guarantees the file is flushed and closed,
    # even if cleaning one of the reviews raises
    with open('dump_' + hotel_name + '.csv', 'w') as outfile:
        outfile.write(url + '\n')
        for each_object in data:
            content = remove_punctuation(each_object.content)
            outfile.write(str(each_object.idx) + ',' + content + '\n')

data_dictionary = {}
for each_hotel in hotels:
    global_index = 0
    print "\nScraping",each_hotel,"reviews."
    # first, let's see how many pages of reviews exist
    soup = make_soup(urls[each_hotel].replace('ASDF','0'))
    page_links = [xx.attrs for xx in soup.find_all('a') if 'data-page-number' in xx.attrs.keys()]
    pages = [int(xx['data-page-number']) for xx in page_links]
    print "Tripadvisor contains",max(pages),"pages of reviews for this hotel."

    # now we'll iterate through each page
    data = []
    for ii in range(max(pages)+1):
        print ii, 
        url = urls[each_hotel].replace('ASDF', str(ii))
        soup = make_soup(url)
        # find all review titles
        titles = [xx.contents[0] for xx in
                  soup.find_all("span", class_="noQuotes")]
        # find all review entries
        pentries = [xx.contents[0] for xx in
                    soup.find_all("p", class_="partial_entry")]
        # add to dataset
        for eachtitle in titles:
            data.append(Review_object(eachtitle, global_index))
            global_index += 1
        for eachentry in pentries:
            if len(eachentry) > 2:
                data.append(Review_object(eachentry, global_index))
                global_index += 1

    print "\nFinished analyzing " + each_hotel
    data_dictionary[each_hotel] = data
    print "Saving data to dump_%s.csv." % (each_hotel)
    dump_data(data, each_hotel)

from nltk.corpus import wordnet as wn
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

datafiles = ['dump_'+xx+'.csv' for xx in hotels]
excludedwords = ['room','hotel','motel']
wordcount = pd.DataFrame()

for eachfile in datafiles:
    infile = open(eachfile,'r')
    words = []
    # Let's clean the data a bit by removing numbers
    for eachline in infile:
        linewords = [xx.lower() for xx in eachline.split(' ')]
        for eachword in linewords:
            words.append(''.join(ch for ch in eachword if ch.isalpha()))
    words = [xx for xx in words if xx not in excludedwords]
    words_unique = np.unique(words)

    # Now we'll filter by parts of speech
    adjectives = []
    for eachword in words_unique:
      ss = wn.synsets(eachword)
      if len(ss) > 0:
        pos = [xx.pos() for xx in ss]
        if 's' in pos:

    # Generate some word clouds
    all_adjectives = [xx for xx in words if xx in adjectives]

    wc = WordCloud(background_color="white", margin=5, width=600, height=300)
    wc2 = WordCloud(background_color="white", margin=5, width=600, height=300)
    if 'isian' in eachfile:
        wc2 = WordCloud(background_color="black", margin=5, width=600, height=300)
    wordcloud = wc.generate(' '.join(words))
    adjectivecloud = wc2.generate(' '.join(all_adjectives))

    # Print top ten
    print eachfile
    wordcount['all_' + eachfile] = [xx[0] for xx in wordcloud.words_[0:10]]
    wordcount['adj_' + eachfile] = [xx[0] for xx in adjectivecloud.words_[0:10]]
    plt.title(eachfile[0:-4].replace('dump_','') + ' all')
    plt.savefig(eachfile[0:-4] + '_all.png',dpi=600)

    plt.title(eachfile[0:-4].replace('dump_','') + ' adjectives')
    plt.savefig(eachfile[0:-4] + '_adjectives.png',dpi=600)