Python: Script to pull text data












1












$begingroup$


This is built with Python 2.7.15.



The goal of this script is to count the number of words spoken by each Senator on the floor of Congress between given dates.



It pulls from the Congressional Record, cuts out the sections not spoken on the floor (e.g., lists of amendments), and returns a count of words by senator. The current list of senators is from the 115th Congress, from 2017 to 2018.



# -*- coding: utf-8 -*-
from requests import get # to make GET request
import time
from datetime import date, timedelta
import os.path
import textract
import re
import csv
import sys

#download URLs
def downloadPDF(url, file_name,
                save_path='/Users/One/Document/Workspace/Projects/Senate/Congressional_Record/'):
    """Download ``url`` and save it as ``file_name`` if the response is a PDF.

    url: address to fetch with an HTTP GET request.
    file_name: name of the file to create inside ``save_path``.
    save_path: target directory; defaults to the originally hard-coded folder.

    Returns 'SUCCESS' when a PDF was written, otherwise None.
    """
    response = get(url)
    # The header may carry parameters (e.g. "application/pdf; charset=...")
    # or be missing entirely, so compare only the bare media type.
    content_type = response.headers.get('content-type') or ''
    media_type = content_type.split(';')[0].strip().lower()
    if media_type != 'application/pdf':
        return None

    path = os.path.join(save_path, file_name)
    # Binary mode: the response body is written byte for byte.
    with open(path, 'wb') as pdf_file:
        pdf_file.write(response.content)
    return 'SUCCESS'

#download congressional records and extract text
def downloadConRecords(startDate, endDate):  # YYYYMMDD
    """Download the Senate PDF of the Congressional Record for each day.

    startDate, endDate: first and last day (inclusive), given as YYYYMMDD
    strings or integers.

    Returns a list with one ``[url, 'YYYYMMDD', extracted_text]`` entry for
    every day on which downloadPDF reported 'SUCCESS'.
    """
    save_path = '/Users/One/Document/Workspace/Projects/Senate/Congressional_Record/'
    startDate = str(startDate)
    endDate = str(endDate)
    first_day = date(int(startDate[0:4]), int(startDate[4:6]), int(startDate[6:8]))
    last_day = date(int(endDate[0:4]), int(endDate[4:6]), int(endDate[6:8]))

    congressionalRecords = []
    # Named "span" so the builtin range() is not shadowed.
    span = last_day - first_day
    for offset in range(span.days + 1):
        day = first_day + timedelta(days=offset)
        YYYY, MM, DD = day.isoformat().split('-')
        url = ('https://www.congress.gov/crec/' + YYYY + '/' + MM + '/' + DD
               + '/CREC-' + YYYY + '-' + MM + '-' + DD + '-senate.pdf')
        filename = YYYY + MM + DD + '.pdf'

        # Pause before every request so the server is not hit in a tight loop.
        time.sleep(20)

        if downloadPDF(url, filename) == 'SUCCESS':
            print(url)
            textExtract = textract.process(save_path + filename)
            # Entry: source URL, date taken from the file name, extracted text.
            congressionalRecords.append([url, filename[:-4], textExtract])

    return congressionalRecords

#process the congressional records
def cleanRecords(congressionalRecords, phraseDelete, segmentStart, segmentEnd):
    """Strip line breaks, unwanted phrases and unwanted segments from records.

    congressionalRecords: list of records whose item [2] is the raw text.
    phraseDelete: literal strings that are removed from the text.
    segmentStart: headings that open a segment to cut out.
    segmentEnd: marker that closes such a segment.

    Every record is extended in place with the cleaned text followed by a
    list that holds, per heading, the segments that were cut. The same list
    object that was passed in is returned.
    """
    for record in congressionalRecords:
        # Turn line breaks into spaces so a segment can span several lines.
        clean = record[2].replace('\n', ' ')

        for phrase in phraseDelete:
            clean = clean.replace(phrase, '')

        dump = []
        for heading in segmentStart:
            # Heading and end marker are plain text; escape them so regex
            # metacharacters in them are matched literally.
            pattern = re.compile(
                '{}.*?{}'.format(re.escape(heading), re.escape(segmentEnd)))
            dump.append(pattern.findall(clean))
            clean = pattern.sub(' ', clean)

        record += [clean, dump]

    return congressionalRecords

#split the text into sections based on the speaker
def multiDelimStringSplitter(aString, separators):
    """Split ``aString`` into speeches and count the words of each one.

    aString: text to split.
    separators: speaker labels that mark the start of a speech. The list is
    not modified.

    Returns a list of ``[speaker, text, wordcount]`` rows in order of
    appearance, with ``wordcount`` as a string. Text before the first label
    is discarded, and a label with no text after it produces no row.
    """
    # Longest labels first, so a label that is a prefix of another one never
    # wins over the longer match. Empty labels are ignored.
    labels = sorted((s for s in separators if s), key=len, reverse=True)
    if not labels:
        return []

    # The capture group makes re.split keep the labels in its result:
    # [preamble, label, text, label, text, ...].
    splitter = re.compile('(' + '|'.join(re.escape(s) for s in labels) + ')')
    pieces = splitter.split(aString)

    rows = []
    for k in range(1, len(pieces) - 1, 2):
        speaker, text = pieces[k], pieces[k + 1]
        if not text:
            continue
        # Drop punctuation so stray symbols are not counted as words.
        words = re.sub(r'[^\w\s]', '', text).split()
        rows.append([speaker, text, str(len(words))])
    return rows

#get information on the records, speakers and word counts
def writeAnalytics(congressionalRecords, senators):
    """Build the export tables for records, speeches and word counts.

    congressionalRecords: records shaped ``[url, date, raw_text, clean_text,
    dump]``. Each record is extended in place with its list of speeches and
    with a flat ``[speaker, count, speaker, count, ...]`` list.
    senators: speaker labels used to split the text and to report on.

    Returns the tuple (textExport, textDump, speechExport, speakerExport,
    speakerAnalytics); every table is a list of rows with a header row first.
    """
    textExport = [['url', 'date', 'extractRaw', 'extractClean']]
    textDump = [['date', 'extractDump']]
    speechExport = [['date', 'speaker', 'text', 'wordCount']]
    speakerExport = [['date', 'speaker', 'wordCount']]
    speakerAnalytics = [['startDate', 'endDate', 'speaker', 'wordCount']]

    # Snapshot of the labels so the row order is the same for every record.
    speakers = list(senators)
    perSpeakerTotal = dict((name, 0) for name in speakers)
    recordDates = []

    for record in congressionalRecords:
        record_date = record[1]

        # Split the cleaned text into one row per speech.
        speeches = multiDelimStringSplitter(record[3], senators)
        record += [speeches]

        # Keep only the headings that actually had something cut out.
        for erased in record[4]:
            if len(erased) > 0:
                textDump.append([record_date, erased])

        wordsperSpeaker = dict((name, 0) for name in speakers)
        for speaker, text, wordcount in speeches:
            speechExport.append([record_date, speaker, text, wordcount])
            wordsperSpeaker[speaker] += int(wordcount)

        # Stored on the record as a flat name/count list.
        flat_counts = []
        for name in speakers:
            flat_counts += [name, wordsperSpeaker[name]]
        record += [flat_counts]

        textExport.append([record[0], record_date, record[2], record[3]])
        recordDates.append(record_date)

        for name in speakers:
            speakerExport.append([record_date, name, wordsperSpeaker[name]])
            perSpeakerTotal[name] += wordsperSpeaker[name]

    # Without any record there are no dates to report; leave them blank
    # instead of letting min()/max() fail on an empty list.
    first_date = min(recordDates) if recordDates else ''
    last_date = max(recordDates) if recordDates else ''
    for name in speakers:
        speakerAnalytics.append([first_date, last_date, name, perSpeakerTotal[name]])

    return textExport, textDump, speechExport, speakerExport, speakerAnalytics

#export to CSV
def exportCSV(flatList, filename):
    """Write the rows in ``flatList`` to ``<filename>.csv``.

    flatList: iterable of rows, each a sequence of cell values.
    filename: output path without the ".csv" extension.
    """
    path = filename + ".csv"
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] < 3:
        f = open(path, "wb")
    else:
        f = open(path, "w", newline="")
    with f:
        csv.writer(f).writerows(flatList)

# Literal phrases handed to cleanRecords, which removes every occurrence
# of them from the extracted text.
phraseDelete = [
',',
'This ‘‘bullet’’ symbol identifies statements or insertions which are not spoken by a Member of the Senate on the floor.'
]

# Headings handed to cleanRecords: the text from one of these headings up to
# the next segmentEnd marker is cut out of the record.
segmentStart = [
' PRAYER ',
' PLEDGE OF ALLEGIANCE ',
' APPOINTMENT OF ACTING PRESIDENT PRO TEMPORE ',
' CERTIFICATES OF ELECTION ',
' MESSAGE FROM THE HOUSE ',
' MESSAGES FROM THE PRESIDENT ',
' EXECUTIVE MESSAGES REFERRED ',
' LIST OF SENATORS BY STATES ',
' PRESIDENTIAL MESSAGES ',
' ENROLLED BILLS PRESENTED ',
' MEASURES REFERRED ',
' EXECUTIVE AND OTHER COMMUNICATIONS ',
' REPORTS OF COMMITTEES ',
' EXECUTIVE REPORTS OF COMMITTEES ',
' AMENDMENTS SUBMITTED AND PROPOSED ',
' INTRODUCTION OF BILLS AND JOINT RESOLUTIONS ',
' ADDITIONAL COSPONSORS ',
' SUBMITTED RESOLUTIONS ',
' SUBMISSION OF CONCURRENT AND SENATE RESOLUTIONS ',
' SENATE RESOLUTION ',
' TEXT OF AMENDMENTS ',
' APPOINTMENT ',
' AUTHORITY FOR COMMITTEES TO MEET ',
' CONFIRMATION ',
' NOMINATION '
]

# Marker that closes a segment opened by one of the headings above.
# NOTE(review): presumably how the section divider of the Record comes out
# of the text extraction - confirm against a real extract.
segmentEnd = ' f '

# Speaker labels used to split the text into speeches.
senators = ['Mr. SESSIONS', 'Mr. STRANGE', 'Mr. JONES', 'Mr. SHELBY', 'Mr. SULLIVAN', 'Ms. MURKOWSKI',
    'Mr. FLAKE', 'Mr. MCCAIN', 'Mr. KYL', 'Mr. COTTON', 'Mr. BOOZMAN', 'Mrs. FEINSTEIN', 'Ms. HARRIS',
    'Mr. GARDNER', 'Mr. BENNET', 'Mr. MURPHY', 'Mr. BLUMENTHAL', 'Mr. CARPER', 'Mr. COONS', 'Mr. NELSON', 'Mr. RUBIO',
    'Mr. PERDUE', 'Mr. ISAKSON', 'Ms. HIRONO', 'Mr. SCHATZ', 'Mr. RISCH', 'Mr. CRAPO', 'Mr. DURBIN',
    'Ms. DUCKWORTH', 'Mr. DONNELLY', 'Mr. YOUNG', 'Mrs. ERNST', 'Mr. GRASSLEY', 'Mr. ROBERTS', 'Mr. MORAN',
    'Mr. MCCONNELL', 'Mr. PAUL', 'Mr. CASSIDY', 'Mr. KENNEDY', 'Mr. KING', 'Ms. COLLINS', 'Mr. CARDIN',
    'Mr. VAN HOLLEN', 'Ms. WARREN', 'Mr. MARKEY', 'Ms. STABENOW', 'Mr. PETERS', 'Ms. KLOBUCHAR',
    'Mr. FRANKEN', 'Ms. SMITH', 'Mr. WICKER', 'Mr. COCHRAN', 'Mrs. HYDE-SMITH', 'Mrs. MCCASKILL',
    'Mr. BLUNT', 'Mr. TESTER', 'Mr. DAINES', 'Mrs. FISCHER', 'Mr. SASSE', 'Mr. HELLER', 'Ms. CORTEZ MASTO',
    'Mrs. SHAHEEN', 'Ms. HASSAN', 'Mr. MENENDEZ', 'Mr. BOOKER', 'Mr. HEINRICH', 'Mr. UDALL',
    'Mrs. GILLIBRAND', 'Mr. SCHUMER', 'Mr. TILLIS', 'Mr. BURR', 'Ms. HEITKAMP', 'Mr. HOEVEN',
    'Mr. BROWN', 'Mr. PORTMAN', 'Mr. INHOFE', 'Mr. LANKFORD', 'Mr. MERKLEY', 'Mr. WYDEN',
    'Mr. CASEY', 'Mr. TOOMEY', 'Mr. WHITEHOUSE', 'Mr. REED', 'Mr. GRAHAM',
    'Mr. SCOTT', 'Mr. ROUNDS', 'Mr. THUNE', 'Mr. CORKER', 'Mr. ALEXANDER', 'Mr. CRUZ',
    'Mr. CORNYN', 'Mr. HATCH', 'Mr. LEE', 'Mr. SANDERS', 'Mr. LEAHY', 'Mr. KAINE',
    'Mr. WARNER', 'Ms. CANTWELL', 'Mrs. MURRAY', 'Mr. MANCHIN', 'Mrs. CAPITO',
    'Ms. BALDWIN', 'Mr. JOHNSON', 'Mr. BARRASSO', 'Mr. ENZI',
    # The comma after 'The VICE PRESIDENT' matters: without it Python joins
    # the two adjacent string literals into a single label.
    'The ACTING PRESIDENT', 'The PRESIDING OFFICER', 'The VICE PRESIDENT',
    'Executive nominations confirmed by']

# Run the pipeline only when the file is executed as a script, so that
# importing it does not start the downloads.
if __name__ == '__main__':
    # Inclusive date range to process, as YYYYMMDD.
    startDate = '20170106'
    endDate = '20170130'
    conRecords = downloadConRecords(startDate, endDate)
    cleanConRecords = cleanRecords(conRecords, phraseDelete, segmentStart, segmentEnd)
    textExport, textDump, speechExport, speakerExport, speakerAnalytics = writeAnalytics(cleanConRecords, senators)

    # One CSV file per table, named after the table.
    exportCSV(textExport, 'textExport')
    exportCSV(textDump, 'textDump')
    exportCSV(speechExport, 'speechExport')
    exportCSV(speakerExport, 'speakerExport')
    exportCSV(speakerAnalytics, 'speakerAnalytics')








share







New contributor




Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.







$endgroup$

















    1












    $begingroup$


    This is built with Python 2.7.15.



    The goal of this script is to count the number of words spoken by each Senator on the floor of Congress between given dates.



    It pulls from the Congressional Record, cuts out the sections not spoken on the floor (e.g., lists of amendments), and returns a count of words by senator. The current list of senators is from the 115th Congress, from 2017 to 2018.



    # -*- coding: utf-8 -*-
    from requests import get # to make GET request
    import time
    from datetime import date, timedelta
    import os.path
    import textract
    import re
    import csv
    import sys

    #download URLs
    def downloadPDF(url, file_name,
                    save_path='/Users/One/Document/Workspace/Projects/Senate/Congressional_Record/'):
        """Download ``url`` and save it as ``file_name`` if the response is a PDF.

        url: address to fetch with an HTTP GET request.
        file_name: name of the file to create inside ``save_path``.
        save_path: target directory; defaults to the originally hard-coded folder.

        Returns 'SUCCESS' when a PDF was written, otherwise None.
        """
        response = get(url)
        # The header may carry parameters (e.g. "application/pdf; charset=...")
        # or be missing entirely, so compare only the bare media type.
        content_type = response.headers.get('content-type') or ''
        media_type = content_type.split(';')[0].strip().lower()
        if media_type != 'application/pdf':
            return None

        path = os.path.join(save_path, file_name)
        # Binary mode: the response body is written byte for byte.
        with open(path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        return 'SUCCESS'

    #download congressional records and extract text
    def downloadConRecords(startDate, endDate):  # YYYYMMDD
        """Download the Senate PDF of the Congressional Record for each day.

        startDate, endDate: first and last day (inclusive), given as YYYYMMDD
        strings or integers.

        Returns a list with one ``[url, 'YYYYMMDD', extracted_text]`` entry for
        every day on which downloadPDF reported 'SUCCESS'.
        """
        save_path = '/Users/One/Document/Workspace/Projects/Senate/Congressional_Record/'
        startDate = str(startDate)
        endDate = str(endDate)
        first_day = date(int(startDate[0:4]), int(startDate[4:6]), int(startDate[6:8]))
        last_day = date(int(endDate[0:4]), int(endDate[4:6]), int(endDate[6:8]))

        congressionalRecords = []
        # Named "span" so the builtin range() is not shadowed.
        span = last_day - first_day
        for offset in range(span.days + 1):
            day = first_day + timedelta(days=offset)
            YYYY, MM, DD = day.isoformat().split('-')
            url = ('https://www.congress.gov/crec/' + YYYY + '/' + MM + '/' + DD
                   + '/CREC-' + YYYY + '-' + MM + '-' + DD + '-senate.pdf')
            filename = YYYY + MM + DD + '.pdf'

            # Pause before every request so the server is not hit in a tight loop.
            time.sleep(20)

            if downloadPDF(url, filename) == 'SUCCESS':
                print(url)
                textExtract = textract.process(save_path + filename)
                # Entry: source URL, date taken from the file name, extracted text.
                congressionalRecords.append([url, filename[:-4], textExtract])

        return congressionalRecords

    #process the congressional records
    def cleanRecords(congressionalRecords, phraseDelete, segmentStart, segmentEnd):
        """Strip line breaks, unwanted phrases and unwanted segments from records.

        congressionalRecords: list of records whose item [2] is the raw text.
        phraseDelete: literal strings that are removed from the text.
        segmentStart: headings that open a segment to cut out.
        segmentEnd: marker that closes such a segment.

        Every record is extended in place with the cleaned text followed by a
        list that holds, per heading, the segments that were cut. The same list
        object that was passed in is returned.
        """
        for record in congressionalRecords:
            # Turn line breaks into spaces so a segment can span several lines.
            clean = record[2].replace('\n', ' ')

            for phrase in phraseDelete:
                clean = clean.replace(phrase, '')

            dump = []
            for heading in segmentStart:
                # Heading and end marker are plain text; escape them so regex
                # metacharacters in them are matched literally.
                pattern = re.compile(
                    '{}.*?{}'.format(re.escape(heading), re.escape(segmentEnd)))
                dump.append(pattern.findall(clean))
                clean = pattern.sub(' ', clean)

            record += [clean, dump]

        return congressionalRecords

    #split the text into sections based on the speaker
    def multiDelimStringSplitter(aString, separators):
        """Split ``aString`` into speeches and count the words of each one.

        aString: text to split.
        separators: speaker labels that mark the start of a speech. The list is
        not modified.

        Returns a list of ``[speaker, text, wordcount]`` rows in order of
        appearance, with ``wordcount`` as a string. Text before the first label
        is discarded, and a label with no text after it produces no row.
        """
        # Longest labels first, so a label that is a prefix of another one never
        # wins over the longer match. Empty labels are ignored.
        labels = sorted((s for s in separators if s), key=len, reverse=True)
        if not labels:
            return []

        # The capture group makes re.split keep the labels in its result:
        # [preamble, label, text, label, text, ...].
        splitter = re.compile('(' + '|'.join(re.escape(s) for s in labels) + ')')
        pieces = splitter.split(aString)

        rows = []
        for k in range(1, len(pieces) - 1, 2):
            speaker, text = pieces[k], pieces[k + 1]
            if not text:
                continue
            # Drop punctuation so stray symbols are not counted as words.
            words = re.sub(r'[^\w\s]', '', text).split()
            rows.append([speaker, text, str(len(words))])
        return rows

    #get information on the records, speakers and word counts
    def writeAnalytics(congressionalRecords, senators):
        """Build the export tables for records, speeches and word counts.

        congressionalRecords: records shaped ``[url, date, raw_text, clean_text,
        dump]``. Each record is extended in place with its list of speeches and
        with a flat ``[speaker, count, speaker, count, ...]`` list.
        senators: speaker labels used to split the text and to report on.

        Returns the tuple (textExport, textDump, speechExport, speakerExport,
        speakerAnalytics); every table is a list of rows with a header row first.
        """
        textExport = [['url', 'date', 'extractRaw', 'extractClean']]
        textDump = [['date', 'extractDump']]
        speechExport = [['date', 'speaker', 'text', 'wordCount']]
        speakerExport = [['date', 'speaker', 'wordCount']]
        speakerAnalytics = [['startDate', 'endDate', 'speaker', 'wordCount']]

        # Snapshot of the labels so the row order is the same for every record.
        speakers = list(senators)
        perSpeakerTotal = dict((name, 0) for name in speakers)
        recordDates = []

        for record in congressionalRecords:
            record_date = record[1]

            # Split the cleaned text into one row per speech.
            speeches = multiDelimStringSplitter(record[3], senators)
            record += [speeches]

            # Keep only the headings that actually had something cut out.
            for erased in record[4]:
                if len(erased) > 0:
                    textDump.append([record_date, erased])

            wordsperSpeaker = dict((name, 0) for name in speakers)
            for speaker, text, wordcount in speeches:
                speechExport.append([record_date, speaker, text, wordcount])
                wordsperSpeaker[speaker] += int(wordcount)

            # Stored on the record as a flat name/count list.
            flat_counts = []
            for name in speakers:
                flat_counts += [name, wordsperSpeaker[name]]
            record += [flat_counts]

            textExport.append([record[0], record_date, record[2], record[3]])
            recordDates.append(record_date)

            for name in speakers:
                speakerExport.append([record_date, name, wordsperSpeaker[name]])
                perSpeakerTotal[name] += wordsperSpeaker[name]

        # Without any record there are no dates to report; leave them blank
        # instead of letting min()/max() fail on an empty list.
        first_date = min(recordDates) if recordDates else ''
        last_date = max(recordDates) if recordDates else ''
        for name in speakers:
            speakerAnalytics.append([first_date, last_date, name, perSpeakerTotal[name]])

        return textExport, textDump, speechExport, speakerExport, speakerAnalytics

    #export to CSV
    def exportCSV(flatList, filename):
        """Write the rows in ``flatList`` to ``<filename>.csv``.

        flatList: iterable of rows, each a sequence of cell values.
        filename: output path without the ".csv" extension.
        """
        path = filename + ".csv"
        # The csv module wants a binary file on Python 2 and a text file opened
        # with newline="" on Python 3.
        if sys.version_info[0] < 3:
            f = open(path, "wb")
        else:
            f = open(path, "w", newline="")
        with f:
            csv.writer(f).writerows(flatList)

    # Literal phrases handed to cleanRecords, which removes every occurrence
    # of them from the extracted text.
    phraseDelete = [
    ',',
    'This ‘‘bullet’’ symbol identifies statements or insertions which are not spoken by a Member of the Senate on the floor.'
    ]

    # Headings handed to cleanRecords: the text from one of these headings up to
    # the next segmentEnd marker is cut out of the record.
    segmentStart = [
    ' PRAYER ',
    ' PLEDGE OF ALLEGIANCE ',
    ' APPOINTMENT OF ACTING PRESIDENT PRO TEMPORE ',
    ' CERTIFICATES OF ELECTION ',
    ' MESSAGE FROM THE HOUSE ',
    ' MESSAGES FROM THE PRESIDENT ',
    ' EXECUTIVE MESSAGES REFERRED ',
    ' LIST OF SENATORS BY STATES ',
    ' PRESIDENTIAL MESSAGES ',
    ' ENROLLED BILLS PRESENTED ',
    ' MEASURES REFERRED ',
    ' EXECUTIVE AND OTHER COMMUNICATIONS ',
    ' REPORTS OF COMMITTEES ',
    ' EXECUTIVE REPORTS OF COMMITTEES ',
    ' AMENDMENTS SUBMITTED AND PROPOSED ',
    ' INTRODUCTION OF BILLS AND JOINT RESOLUTIONS ',
    ' ADDITIONAL COSPONSORS ',
    ' SUBMITTED RESOLUTIONS ',
    ' SUBMISSION OF CONCURRENT AND SENATE RESOLUTIONS ',
    ' SENATE RESOLUTION ',
    ' TEXT OF AMENDMENTS ',
    ' APPOINTMENT ',
    ' AUTHORITY FOR COMMITTEES TO MEET ',
    ' CONFIRMATION ',
    ' NOMINATION '
    ]

    # Marker that closes a segment opened by one of the headings above.
    # NOTE(review): presumably how the section divider of the Record comes out
    # of the text extraction - confirm against a real extract.
    segmentEnd = ' f '

    # Speaker labels used to split the text into speeches.
    senators = ['Mr. SESSIONS', 'Mr. STRANGE', 'Mr. JONES', 'Mr. SHELBY', 'Mr. SULLIVAN', 'Ms. MURKOWSKI',
        'Mr. FLAKE', 'Mr. MCCAIN', 'Mr. KYL', 'Mr. COTTON', 'Mr. BOOZMAN', 'Mrs. FEINSTEIN', 'Ms. HARRIS',
        'Mr. GARDNER', 'Mr. BENNET', 'Mr. MURPHY', 'Mr. BLUMENTHAL', 'Mr. CARPER', 'Mr. COONS', 'Mr. NELSON', 'Mr. RUBIO',
        'Mr. PERDUE', 'Mr. ISAKSON', 'Ms. HIRONO', 'Mr. SCHATZ', 'Mr. RISCH', 'Mr. CRAPO', 'Mr. DURBIN',
        'Ms. DUCKWORTH', 'Mr. DONNELLY', 'Mr. YOUNG', 'Mrs. ERNST', 'Mr. GRASSLEY', 'Mr. ROBERTS', 'Mr. MORAN',
        'Mr. MCCONNELL', 'Mr. PAUL', 'Mr. CASSIDY', 'Mr. KENNEDY', 'Mr. KING', 'Ms. COLLINS', 'Mr. CARDIN',
        'Mr. VAN HOLLEN', 'Ms. WARREN', 'Mr. MARKEY', 'Ms. STABENOW', 'Mr. PETERS', 'Ms. KLOBUCHAR',
        'Mr. FRANKEN', 'Ms. SMITH', 'Mr. WICKER', 'Mr. COCHRAN', 'Mrs. HYDE-SMITH', 'Mrs. MCCASKILL',
        'Mr. BLUNT', 'Mr. TESTER', 'Mr. DAINES', 'Mrs. FISCHER', 'Mr. SASSE', 'Mr. HELLER', 'Ms. CORTEZ MASTO',
        'Mrs. SHAHEEN', 'Ms. HASSAN', 'Mr. MENENDEZ', 'Mr. BOOKER', 'Mr. HEINRICH', 'Mr. UDALL',
        'Mrs. GILLIBRAND', 'Mr. SCHUMER', 'Mr. TILLIS', 'Mr. BURR', 'Ms. HEITKAMP', 'Mr. HOEVEN',
        'Mr. BROWN', 'Mr. PORTMAN', 'Mr. INHOFE', 'Mr. LANKFORD', 'Mr. MERKLEY', 'Mr. WYDEN',
        'Mr. CASEY', 'Mr. TOOMEY', 'Mr. WHITEHOUSE', 'Mr. REED', 'Mr. GRAHAM',
        'Mr. SCOTT', 'Mr. ROUNDS', 'Mr. THUNE', 'Mr. CORKER', 'Mr. ALEXANDER', 'Mr. CRUZ',
        'Mr. CORNYN', 'Mr. HATCH', 'Mr. LEE', 'Mr. SANDERS', 'Mr. LEAHY', 'Mr. KAINE',
        'Mr. WARNER', 'Ms. CANTWELL', 'Mrs. MURRAY', 'Mr. MANCHIN', 'Mrs. CAPITO',
        'Ms. BALDWIN', 'Mr. JOHNSON', 'Mr. BARRASSO', 'Mr. ENZI',
        # The comma after 'The VICE PRESIDENT' matters: without it Python joins
        # the two adjacent string literals into a single label.
        'The ACTING PRESIDENT', 'The PRESIDING OFFICER', 'The VICE PRESIDENT',
        'Executive nominations confirmed by']

    # Run the pipeline only when the file is executed as a script, so that
    # importing it does not start the downloads.
    if __name__ == '__main__':
        # Inclusive date range to process, as YYYYMMDD.
        startDate = '20170106'
        endDate = '20170130'
        conRecords = downloadConRecords(startDate, endDate)
        cleanConRecords = cleanRecords(conRecords, phraseDelete, segmentStart, segmentEnd)
        textExport, textDump, speechExport, speakerExport, speakerAnalytics = writeAnalytics(cleanConRecords, senators)

        # One CSV file per table, named after the table.
        exportCSV(textExport, 'textExport')
        exportCSV(textDump, 'textDump')
        exportCSV(speechExport, 'speechExport')
        exportCSV(speakerExport, 'speakerExport')
        exportCSV(speakerAnalytics, 'speakerAnalytics')








    share







    New contributor




    Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.







    $endgroup$















      1












      1








      1





      $begingroup$


      This is built with Python 2.7.15.



      The goal of this script is to count the number of words spoken by each Senator on the floor of Congress between given dates.



      It pulls from the Congressional Record, cuts out the sections not spoken on the floor (e.g., lists of amendments), and returns a count of words by senator. The current list of senators is from the 115th Congress, from 2017 to 2018.



      # -*- coding: utf-8 -*-
      from requests import get # to make GET request
      import time
      from datetime import date, timedelta
      import os.path
      import textract
      import re
      import csv
      import sys

      #download URLs
      def downloadPDF(url, file_name,
                      save_path='/Users/One/Document/Workspace/Projects/Senate/Congressional_Record/'):
          """Download ``url`` and save it as ``file_name`` if the response is a PDF.

          url: address to fetch with an HTTP GET request.
          file_name: name of the file to create inside ``save_path``.
          save_path: target directory; defaults to the originally hard-coded folder.

          Returns 'SUCCESS' when a PDF was written, otherwise None.
          """
          response = get(url)
          # The header may carry parameters (e.g. "application/pdf; charset=...")
          # or be missing entirely, so compare only the bare media type.
          content_type = response.headers.get('content-type') or ''
          media_type = content_type.split(';')[0].strip().lower()
          if media_type != 'application/pdf':
              return None

          path = os.path.join(save_path, file_name)
          # Binary mode: the response body is written byte for byte.
          with open(path, 'wb') as pdf_file:
              pdf_file.write(response.content)
          return 'SUCCESS'

      #download congressional records and extract text
      def downloadConRecords(startDate, endDate):  # YYYYMMDD
          """Download the Senate PDF of the Congressional Record for each day.

          startDate, endDate: first and last day (inclusive), given as YYYYMMDD
          strings or integers.

          Returns a list with one ``[url, 'YYYYMMDD', extracted_text]`` entry for
          every day on which downloadPDF reported 'SUCCESS'.
          """
          save_path = '/Users/One/Document/Workspace/Projects/Senate/Congressional_Record/'
          startDate = str(startDate)
          endDate = str(endDate)
          first_day = date(int(startDate[0:4]), int(startDate[4:6]), int(startDate[6:8]))
          last_day = date(int(endDate[0:4]), int(endDate[4:6]), int(endDate[6:8]))

          congressionalRecords = []
          # Named "span" so the builtin range() is not shadowed.
          span = last_day - first_day
          for offset in range(span.days + 1):
              day = first_day + timedelta(days=offset)
              YYYY, MM, DD = day.isoformat().split('-')
              url = ('https://www.congress.gov/crec/' + YYYY + '/' + MM + '/' + DD
                     + '/CREC-' + YYYY + '-' + MM + '-' + DD + '-senate.pdf')
              filename = YYYY + MM + DD + '.pdf'

              # Pause before every request so the server is not hit in a tight loop.
              time.sleep(20)

              if downloadPDF(url, filename) == 'SUCCESS':
                  print(url)
                  textExtract = textract.process(save_path + filename)
                  # Entry: source URL, date taken from the file name, extracted text.
                  congressionalRecords.append([url, filename[:-4], textExtract])

          return congressionalRecords

      #process the congressional records
      def cleanRecords(congressionalRecords, phraseDelete, segmentStart, segmentEnd):
          """Strip line breaks, unwanted phrases and unwanted segments from records.

          congressionalRecords: list of records whose item [2] is the raw text.
          phraseDelete: literal strings that are removed from the text.
          segmentStart: headings that open a segment to cut out.
          segmentEnd: marker that closes such a segment.

          Every record is extended in place with the cleaned text followed by a
          list that holds, per heading, the segments that were cut. The same list
          object that was passed in is returned.
          """
          for record in congressionalRecords:
              # Turn line breaks into spaces so a segment can span several lines.
              clean = record[2].replace('\n', ' ')

              for phrase in phraseDelete:
                  clean = clean.replace(phrase, '')

              dump = []
              for heading in segmentStart:
                  # Heading and end marker are plain text; escape them so regex
                  # metacharacters in them are matched literally.
                  pattern = re.compile(
                      '{}.*?{}'.format(re.escape(heading), re.escape(segmentEnd)))
                  dump.append(pattern.findall(clean))
                  clean = pattern.sub(' ', clean)

              record += [clean, dump]

          return congressionalRecords

      #split the text into sections based on the speaker
      def multiDelimStringSplitter(aString, separators):
          """Split ``aString`` into speeches and count the words of each one.

          aString: text to split.
          separators: speaker labels that mark the start of a speech. The list is
          not modified.

          Returns a list of ``[speaker, text, wordcount]`` rows in order of
          appearance, with ``wordcount`` as a string. Text before the first label
          is discarded, and a label with no text after it produces no row.
          """
          # Longest labels first, so a label that is a prefix of another one never
          # wins over the longer match. Empty labels are ignored.
          labels = sorted((s for s in separators if s), key=len, reverse=True)
          if not labels:
              return []

          # The capture group makes re.split keep the labels in its result:
          # [preamble, label, text, label, text, ...].
          splitter = re.compile('(' + '|'.join(re.escape(s) for s in labels) + ')')
          pieces = splitter.split(aString)

          rows = []
          for k in range(1, len(pieces) - 1, 2):
              speaker, text = pieces[k], pieces[k + 1]
              if not text:
                  continue
              # Drop punctuation so stray symbols are not counted as words.
              words = re.sub(r'[^\w\s]', '', text).split()
              rows.append([speaker, text, str(len(words))])
          return rows

      #get information on the records, speakers and word counts
      def writeAnalytics(congressionalRecords, senators):
          """Build the export tables for records, speeches and word counts.

          congressionalRecords: records shaped ``[url, date, raw_text, clean_text,
          dump]``. Each record is extended in place with its list of speeches and
          with a flat ``[speaker, count, speaker, count, ...]`` list.
          senators: speaker labels used to split the text and to report on.

          Returns the tuple (textExport, textDump, speechExport, speakerExport,
          speakerAnalytics); every table is a list of rows with a header row first.
          """
          textExport = [['url', 'date', 'extractRaw', 'extractClean']]
          textDump = [['date', 'extractDump']]
          speechExport = [['date', 'speaker', 'text', 'wordCount']]
          speakerExport = [['date', 'speaker', 'wordCount']]
          speakerAnalytics = [['startDate', 'endDate', 'speaker', 'wordCount']]

          # Snapshot of the labels so the row order is the same for every record.
          speakers = list(senators)
          perSpeakerTotal = dict((name, 0) for name in speakers)
          recordDates = []

          for record in congressionalRecords:
              record_date = record[1]

              # Split the cleaned text into one row per speech.
              speeches = multiDelimStringSplitter(record[3], senators)
              record += [speeches]

              # Keep only the headings that actually had something cut out.
              for erased in record[4]:
                  if len(erased) > 0:
                      textDump.append([record_date, erased])

              wordsperSpeaker = dict((name, 0) for name in speakers)
              for speaker, text, wordcount in speeches:
                  speechExport.append([record_date, speaker, text, wordcount])
                  wordsperSpeaker[speaker] += int(wordcount)

              # Stored on the record as a flat name/count list.
              flat_counts = []
              for name in speakers:
                  flat_counts += [name, wordsperSpeaker[name]]
              record += [flat_counts]

              textExport.append([record[0], record_date, record[2], record[3]])
              recordDates.append(record_date)

              for name in speakers:
                  speakerExport.append([record_date, name, wordsperSpeaker[name]])
                  perSpeakerTotal[name] += wordsperSpeaker[name]

          # Without any record there are no dates to report; leave them blank
          # instead of letting min()/max() fail on an empty list.
          first_date = min(recordDates) if recordDates else ''
          last_date = max(recordDates) if recordDates else ''
          for name in speakers:
              speakerAnalytics.append([first_date, last_date, name, perSpeakerTotal[name]])

          return textExport, textDump, speechExport, speakerExport, speakerAnalytics

      #export to CSV
      def exportCSV(flatList, filename):
          """Write the rows in ``flatList`` to ``<filename>.csv``.

          flatList: iterable of rows, each a sequence of cell values.
          filename: output path without the ".csv" extension.
          """
          path = filename + ".csv"
          # The csv module wants a binary file on Python 2 and a text file opened
          # with newline="" on Python 3.
          if sys.version_info[0] < 3:
              f = open(path, "wb")
          else:
              f = open(path, "w", newline="")
          with f:
              csv.writer(f).writerows(flatList)

      # Literal phrases handed to cleanRecords, which removes every occurrence
      # of them from the extracted text.
      phraseDelete = [
      ',',
      'This ‘‘bullet’’ symbol identifies statements or insertions which are not spoken by a Member of the Senate on the floor.'
      ]

      # Headings handed to cleanRecords: the text from one of these headings up to
      # the next segmentEnd marker is cut out of the record.
      segmentStart = [
      ' PRAYER ',
      ' PLEDGE OF ALLEGIANCE ',
      ' APPOINTMENT OF ACTING PRESIDENT PRO TEMPORE ',
      ' CERTIFICATES OF ELECTION ',
      ' MESSAGE FROM THE HOUSE ',
      ' MESSAGES FROM THE PRESIDENT ',
      ' EXECUTIVE MESSAGES REFERRED ',
      ' LIST OF SENATORS BY STATES ',
      ' PRESIDENTIAL MESSAGES ',
      ' ENROLLED BILLS PRESENTED ',
      ' MEASURES REFERRED ',
      ' EXECUTIVE AND OTHER COMMUNICATIONS ',
      ' REPORTS OF COMMITTEES ',
      ' EXECUTIVE REPORTS OF COMMITTEES ',
      ' AMENDMENTS SUBMITTED AND PROPOSED ',
      ' INTRODUCTION OF BILLS AND JOINT RESOLUTIONS ',
      ' ADDITIONAL COSPONSORS ',
      ' SUBMITTED RESOLUTIONS ',
      ' SUBMISSION OF CONCURRENT AND SENATE RESOLUTIONS ',
      ' SENATE RESOLUTION ',
      ' TEXT OF AMENDMENTS ',
      ' APPOINTMENT ',
      ' AUTHORITY FOR COMMITTEES TO MEET ',
      ' CONFIRMATION ',
      ' NOMINATION '
      ]

      # Marker that closes a segment opened by one of the headings above.
      # NOTE(review): presumably how the section divider of the Record comes out
      # of the text extraction - confirm against a real extract.
      segmentEnd = ' f '

      # Speaker labels used to split the text into speeches.
      senators = ['Mr. SESSIONS', 'Mr. STRANGE', 'Mr. JONES', 'Mr. SHELBY', 'Mr. SULLIVAN', 'Ms. MURKOWSKI',
          'Mr. FLAKE', 'Mr. MCCAIN', 'Mr. KYL', 'Mr. COTTON', 'Mr. BOOZMAN', 'Mrs. FEINSTEIN', 'Ms. HARRIS',
          'Mr. GARDNER', 'Mr. BENNET', 'Mr. MURPHY', 'Mr. BLUMENTHAL', 'Mr. CARPER', 'Mr. COONS', 'Mr. NELSON', 'Mr. RUBIO',
          'Mr. PERDUE', 'Mr. ISAKSON', 'Ms. HIRONO', 'Mr. SCHATZ', 'Mr. RISCH', 'Mr. CRAPO', 'Mr. DURBIN',
          'Ms. DUCKWORTH', 'Mr. DONNELLY', 'Mr. YOUNG', 'Mrs. ERNST', 'Mr. GRASSLEY', 'Mr. ROBERTS', 'Mr. MORAN',
          'Mr. MCCONNELL', 'Mr. PAUL', 'Mr. CASSIDY', 'Mr. KENNEDY', 'Mr. KING', 'Ms. COLLINS', 'Mr. CARDIN',
          'Mr. VAN HOLLEN', 'Ms. WARREN', 'Mr. MARKEY', 'Ms. STABENOW', 'Mr. PETERS', 'Ms. KLOBUCHAR',
          'Mr. FRANKEN', 'Ms. SMITH', 'Mr. WICKER', 'Mr. COCHRAN', 'Mrs. HYDE-SMITH', 'Mrs. MCCASKILL',
          'Mr. BLUNT', 'Mr. TESTER', 'Mr. DAINES', 'Mrs. FISCHER', 'Mr. SASSE', 'Mr. HELLER', 'Ms. CORTEZ MASTO',
          'Mrs. SHAHEEN', 'Ms. HASSAN', 'Mr. MENENDEZ', 'Mr. BOOKER', 'Mr. HEINRICH', 'Mr. UDALL',
          'Mrs. GILLIBRAND', 'Mr. SCHUMER', 'Mr. TILLIS', 'Mr. BURR', 'Ms. HEITKAMP', 'Mr. HOEVEN',
          'Mr. BROWN', 'Mr. PORTMAN', 'Mr. INHOFE', 'Mr. LANKFORD', 'Mr. MERKLEY', 'Mr. WYDEN',
          'Mr. CASEY', 'Mr. TOOMEY', 'Mr. WHITEHOUSE', 'Mr. REED', 'Mr. GRAHAM',
          'Mr. SCOTT', 'Mr. ROUNDS', 'Mr. THUNE', 'Mr. CORKER', 'Mr. ALEXANDER', 'Mr. CRUZ',
          'Mr. CORNYN', 'Mr. HATCH', 'Mr. LEE', 'Mr. SANDERS', 'Mr. LEAHY', 'Mr. KAINE',
          'Mr. WARNER', 'Ms. CANTWELL', 'Mrs. MURRAY', 'Mr. MANCHIN', 'Mrs. CAPITO',
          'Ms. BALDWIN', 'Mr. JOHNSON', 'Mr. BARRASSO', 'Mr. ENZI',
          # The comma after 'The VICE PRESIDENT' matters: without it Python joins
          # the two adjacent string literals into a single label.
          'The ACTING PRESIDENT', 'The PRESIDING OFFICER', 'The VICE PRESIDENT',
          'Executive nominations confirmed by']

      # Run the pipeline only when the file is executed as a script, so that
      # importing it does not start the downloads.
      if __name__ == '__main__':
          # Inclusive date range to process, as YYYYMMDD.
          startDate = '20170106'
          endDate = '20170130'
          conRecords = downloadConRecords(startDate, endDate)
          cleanConRecords = cleanRecords(conRecords, phraseDelete, segmentStart, segmentEnd)
          textExport, textDump, speechExport, speakerExport, speakerAnalytics = writeAnalytics(cleanConRecords, senators)

          # One CSV file per table, named after the table.
          exportCSV(textExport, 'textExport')
          exportCSV(textDump, 'textDump')
          exportCSV(speechExport, 'speechExport')
          exportCSV(speakerExport, 'speakerExport')
          exportCSV(speakerAnalytics, 'speakerAnalytics')








      share







      New contributor




      Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.







      $endgroup$




      This is built with Python 2.7.15.



      The goal of this script is to count the number of words spoken by each Senator on the floor of Congress between given dates.



      It pulls from the Congressional Record, cuts out the sections not spoken on the floor (e.g., lists of amendments), and returns a count of words by senator. The current list of senators is from the 115th Congress, from 2017 to 2018.



      # -*- coding: utf-8 -*-
      from requests import get # to make GET request
      import time
      from datetime import date, timedelta
      import os.path
      import textract
      import re
      import csv
      import sys

      #download URLs
      def downloadPDF(url, file_name,
                      save_path='/Users/One/Document/Workspace/Projects/Senate/Congressional_Record/'):
          """Download ``url`` and save it as ``file_name`` if the response is a PDF.

          url: address to fetch with an HTTP GET request.
          file_name: name of the file to create inside ``save_path``.
          save_path: target directory; defaults to the originally hard-coded folder.

          Returns 'SUCCESS' when a PDF was written, otherwise None.
          """
          response = get(url)
          # The header may carry parameters (e.g. "application/pdf; charset=...")
          # or be missing entirely, so compare only the bare media type.
          content_type = response.headers.get('content-type') or ''
          media_type = content_type.split(';')[0].strip().lower()
          if media_type != 'application/pdf':
              return None

          path = os.path.join(save_path, file_name)
          # Binary mode: the response body is written byte for byte.
          with open(path, 'wb') as pdf_file:
              pdf_file.write(response.content)
          return 'SUCCESS'

      #download congressional records and extract text
      def downloadConRecords(startDate, endDate):  # YYYYMMDD
          """Download the Senate PDF of the Congressional Record for each day.

          startDate, endDate: first and last day (inclusive), given as YYYYMMDD
          strings or integers.

          Returns a list with one ``[url, 'YYYYMMDD', extracted_text]`` entry for
          every day on which downloadPDF reported 'SUCCESS'.
          """
          save_path = '/Users/One/Document/Workspace/Projects/Senate/Congressional_Record/'
          startDate = str(startDate)
          endDate = str(endDate)
          first_day = date(int(startDate[0:4]), int(startDate[4:6]), int(startDate[6:8]))
          last_day = date(int(endDate[0:4]), int(endDate[4:6]), int(endDate[6:8]))

          congressionalRecords = []
          # Named "span" so the builtin range() is not shadowed.
          span = last_day - first_day
          for offset in range(span.days + 1):
              day = first_day + timedelta(days=offset)
              YYYY, MM, DD = day.isoformat().split('-')
              url = ('https://www.congress.gov/crec/' + YYYY + '/' + MM + '/' + DD
                     + '/CREC-' + YYYY + '-' + MM + '-' + DD + '-senate.pdf')
              filename = YYYY + MM + DD + '.pdf'

              # Pause before every request so the server is not hit in a tight loop.
              time.sleep(20)

              if downloadPDF(url, filename) == 'SUCCESS':
                  print(url)
                  textExtract = textract.process(save_path + filename)
                  # Entry: source URL, date taken from the file name, extracted text.
                  congressionalRecords.append([url, filename[:-4], textExtract])

          return congressionalRecords

      #process the congressional records
      def cleanRecords(congressionalRecords, phraseDelete, segmentStart, segmentEnd):
          """Strip line breaks, unwanted phrases and whole segments from each record.

          Every record is a list whose element ``[2]`` is the raw text extract.
          The records are modified in place: the cleaned text is appended as the
          next element, followed by a "dump" list holding, for each entry of
          *segmentStart* in order, the list of segments that were cut out.

          Args:
              congressionalRecords: list of record lists (mutated in place).
              phraseDelete: literal substrings removed from the text.
              segmentStart: segment openers; each is interpolated, unescaped,
                  into the regex ``<start>.*?<segmentEnd>``.
              segmentEnd: text that closes a segment (also interpolated unescaped).

          Returns:
              The same *congressionalRecords* list that was passed in.
          """
          # The patterns do not depend on the record, so build them once.
          segmentPatterns = [re.compile('{}.*?{}'.format(title, segmentEnd))
                             for title in segmentStart]

          for record in congressionalRecords:
              # Turn line breaks into spaces so a segment can be matched even
              # when it spans several lines ('.' does not match a newline).
              clean = record[2].replace('\n', ' ')

              for phrase in phraseDelete:
                  clean = clean.replace(phrase, '')

              dump = []
              for pattern in segmentPatterns:
                  # Record what is about to be cut, then cut it; later patterns
                  # therefore see the text with earlier segments already removed.
                  dump.append(pattern.findall(clean))
                  clean = pattern.sub(' ', clean)

              record += [clean]
              record += [dump]

          return congressionalRecords

      #split the text into sections based on the speaker
      def multiDelimStringSplitter(aString, separators):
          """Split *aString* into speeches, one per occurrence of a separator.

          Args:
              aString: the text to split.
              separators: strings marking the start of a speech (speaker names).
                  The list is not modified. Empty strings are ignored.

          Returns:
              A list of ``[separator, text, wordCount]`` rows in order of
              appearance. ``text`` is everything between that separator and the
              next one (or the end of the string), and ``wordCount`` is the
              number of whitespace-separated words in ``text`` after
              punctuation is removed, as a string. Text before the first
              separator is discarded, as is a separator that is not followed
              by any text.
          """
          # Longest first: when several separators match at one position the
          # regex alternation then picks the longest of them.
          ordered = sorted((sep for sep in separators if sep), key=len, reverse=True)
          if not ordered:
              return []

          splitter = re.compile('(' + '|'.join(re.escape(sep) for sep in ordered) + ')')
          # With one capturing group, split() alternates text and separator:
          # [text, sep, text, sep, text, ...] and always ends on a text chunk.
          parts = splitter.split(aString)

          rows = []
          for speaker, text in zip(parts[1::2], parts[2::2]):
              if not text:
                  # Separator immediately followed by another one, or by the end.
                  continue
              #strip punctuation out of the string before counting words
              words = re.sub(r'[^\w\s]', '', text).split()
              rows.append([speaker, text, str(len(words))])

          return rows

      #get information on the records, speakers and word counts
      def writeAnalytics(congressionalRecords, senators):
          """Build the five export tables from the cleaned records.

          Each record is read as ``[url, date, rawText, cleanText, dump]``. The
          function appends two more elements to every record: the list of
          speeches returned by ``multiDelimStringSplitter`` and a flat
          ``[speaker, wordCount, speaker, wordCount, ...]`` list for that record.

          Args:
              congressionalRecords: list of record lists (mutated in place).
              senators: speaker names used both as split separators and as the
                  rows of the per-speaker tables.

          Returns:
              A tuple ``(textExport, textDump, speechExport, speakerExport,
              speakerAnalytics)``; each is a list of rows whose first row is
              the header. With no records, ``speakerAnalytics`` still has one
              row per speaker, with empty dates and a count of 0.
          """
          textExport = [['url','date','extractRaw','extractClean']]
          textDump = [['date','extractDump']]
          speechExport = [['date','speaker','text','wordCount']]
          speakerExport = [['date','speaker','wordCount']]
          speakerAnalytics = [['startDate', 'endDate', 'speaker', 'wordCount']]
          perSpeakerTotal = dict.fromkeys(senators, 0)
          recordDates = []

          for record in congressionalRecords:
              recordDate = record[1]
              wordsperSpeaker = dict.fromkeys(senators, 0)

              #split the extract into each speech
              splitLines = multiDelimStringSplitter(record[3], senators)
              record += [splitLines]

              #prep the dump export, skipping segments that matched nothing
              for erased in record[4]:
                  if len(erased) > 0:
                      textDump.append([recordDate, erased])

              #add the words per speaker
              for speech in splitLines:
                  speechExport.append([recordDate, speech[0], speech[1], speech[2]])
                  wordsperSpeaker[speech[0]] += int(speech[2])

              # Keep the flat [name, count, ...] layout for the element appended
              # to the record.
              flatCounts = []
              for name in senators:
                  flatCounts += [name, wordsperSpeaker[name]]
              record += [flatCounts]

              textExport.append([record[0], recordDate, record[2], record[3]])
              recordDates.append(recordDate)

              #per-record rows, and running totals across all records
              for name in senators:
                  speakerExport.append([recordDate, name, wordsperSpeaker[name]])
                  perSpeakerTotal[name] += wordsperSpeaker[name]

          # min()/max() raise on an empty list, e.g. when nothing was downloaded.
          firstDate = min(recordDates) if recordDates else ''
          lastDate = max(recordDates) if recordDates else ''
          for name in senators:
              speakerAnalytics.append([firstDate, lastDate, name, perSpeakerTotal[name]])

          return textExport, textDump, speechExport, speakerExport, speakerAnalytics

      #export to CSV
      def exportCSV(flatList, filename):
          """Write the rows in *flatList* to ``<filename>.csv``.

          Args:
              flatList: iterable of rows, each an iterable of cell values.
              filename: output path without the ``.csv`` extension.
          """
          path = filename + ".csv"
          if sys.version_info[0] < 3:
              # Python 2: the csv module wants a binary-mode file.
              handle = open(path, "wb")
          else:
              # Python 3: text mode with newline="" so the writer's own line
              # endings are not translated a second time.
              handle = open(path, "w", newline="", encoding="utf-8")
          with handle as f:
              csv.writer(f).writerows(flatList)

      # Literal substrings that cleanRecords removes from every extract:
      # all commas, plus one fixed sentence about the "bullet" symbol.
      phraseDelete = [
      ',',
      'This ‘‘bullet’’ symbol identifies statements or insertions which are not spoken by a Member of the Senate on the floor.'
      ]

      # Openers of the segments that cleanRecords cuts out. Each entry is combined
      # with segmentEnd into the regex '<start>.*?<segmentEnd>', so a cut runs from
      # the opener to the nearest following segmentEnd. The surrounding spaces are
      # part of the match.
      segmentStart = [
      ' PRAYER ',
      ' PLEDGE OF ALLEGIANCE ',
      ' APPOINTMENT OF ACTING PRESIDENT PRO TEMPORE ',
      ' CERTIFICATES OF ELECTION ',
      ' MESSAGE FROM THE HOUSE ',
      ' MESSAGES FROM THE PRESIDENT ',
      ' EXECUTIVE MESSAGES REFERRED ',
      ' LIST OF SENATORS BY STATES ',
      ' PRESIDENTIAL MESSAGES ',
      ' ENROLLED BILLS PRESENTED ',
      ' MEASURES REFERRED ',
      ' EXECUTIVE AND OTHER COMMUNICATIONS ',
      ' REPORTS OF COMMITTEES ',
      ' EXECUTIVE REPORTS OF COMMITTEES ',
      ' AMENDMENTS SUBMITTED AND PROPOSED ',
      ' INTRODUCTION OF BILLS AND JOINT RESOLUTIONS ',
      ' ADDITIONAL COSPONSORS ',
      ' SUBMITTED RESOLUTIONS ',
      ' SUBMISSION OF CONCURRENT AND SENATE RESOLUTIONS ',
      ' SENATE RESOLUTION ',
      ' TEXT OF AMENDMENTS ',
      ' APPOINTMENT ',
      ' AUTHORITY FOR COMMITTEES TO MEET ',
      ' CONFIRMATION ',
      ' NOMINATION '
      ]

      # Text that closes a cut segment.
      # NOTE(review): presumably ' f ' is how the section-divider glyph comes out of
      # the PDF text extraction -- confirm against a real extract.
      segmentEnd = ' f '

      # Speaker markers used to split the record into speeches: the senators'
      # names as written in this list, followed by the chair titles and one
      # heading. Every entry must be followed by a comma, because adjacent
      # string literals are silently concatenated into a single entry.
      senators = ['Mr. SESSIONS', 'Mr. STRANGE', 'Mr. JONES', 'Mr. SHELBY', 'Mr. SULLIVAN', 'Ms. MURKOWSKI',
          'Mr. FLAKE', 'Mr. MCCAIN', 'Mr. KYL', 'Mr. COTTON', 'Mr. BOOZMAN', 'Mrs. FEINSTEIN', 'Ms. HARRIS',
          'Mr. GARDNER', 'Mr. BENNET', 'Mr. MURPHY', 'Mr. BLUMENTHAL', 'Mr. CARPER', 'Mr. COONS', 'Mr. NELSON', 'Mr. RUBIO',
          'Mr. PERDUE', 'Mr. ISAKSON', 'Ms. HIRONO', 'Mr. SCHATZ', 'Mr. RISCH', 'Mr. CRAPO', 'Mr. DURBIN',
          'Ms. DUCKWORTH', 'Mr. DONNELLY', 'Mr. YOUNG', 'Mrs. ERNST', 'Mr. GRASSLEY', 'Mr. ROBERTS', 'Mr. MORAN',
          'Mr. MCCONNELL', 'Mr. PAUL', 'Mr. CASSIDY', 'Mr. KENNEDY', 'Mr. KING', 'Ms. COLLINS', 'Mr. CARDIN',
          'Mr. VAN HOLLEN', 'Ms. WARREN', 'Mr. MARKEY', 'Ms. STABENOW', 'Mr. PETERS', 'Ms. KLOBUCHAR',
          'Mr. FRANKEN', 'Ms. SMITH', 'Mr. WICKER', 'Mr. COCHRAN', 'Mrs. HYDE-SMITH', 'Mrs. MCCASKILL',
          'Mr. BLUNT', 'Mr. TESTER', 'Mr. DAINES', 'Mrs. FISCHER', 'Mr. SASSE', 'Mr. HELLER', 'Ms. CORTEZ MASTO',
          'Mrs. SHAHEEN', 'Ms. HASSAN', 'Mr. MENENDEZ', 'Mr. BOOKER', 'Mr. HEINRICH', 'Mr. UDALL',
          'Mrs. GILLIBRAND', 'Mr. SCHUMER', 'Mr. TILLIS', 'Mr. BURR', 'Ms. HEITKAMP', 'Mr. HOEVEN',
          'Mr. BROWN', 'Mr. PORTMAN', 'Mr. INHOFE', 'Mr. LANKFORD', 'Mr. MERKLEY', 'Mr. WYDEN',
          'Mr. CASEY', 'Mr. TOOMEY', 'Mr. WHITEHOUSE', 'Mr. REED', 'Mr. GRAHAM',
          'Mr. SCOTT', 'Mr. ROUNDS', 'Mr. THUNE', 'Mr. CORKER', 'Mr. ALEXANDER', 'Mr. CRUZ',
          'Mr. CORNYN', 'Mr. HATCH', 'Mr. LEE', 'Mr. SANDERS', 'Mr. LEAHY', 'Mr. KAINE',
          'Mr. WARNER', 'Ms. CANTWELL', 'Mrs. MURRAY', 'Mr. MANCHIN', 'Mrs. CAPITO',
          'Ms. BALDWIN', 'Mr. JOHNSON', 'Mr. BARRASSO', 'Mr. ENZI',
          'The ACTING PRESIDENT', 'The PRESIDING OFFICER', 'The VICE PRESIDENT',
          'Executive nominations confirmed by']

      # Run the whole pipeline only when executed as a script, so importing this
      # module does not start the downloads.
      if __name__ == "__main__":
          startDate = '20170106'
          endDate = '20170130'
          conRecords = downloadConRecords(startDate, endDate)
          # cleanRecords extends the records in place and returns the same list.
          cleanConRecords = cleanRecords(conRecords, phraseDelete, segmentStart, segmentEnd)
          textExport, textDump, speechExport, speakerExport, speakerAnalytics = writeAnalytics(cleanConRecords, senators)

          # One CSV per table, named after the table.
          for exportRows, exportName in (
                  (textExport, 'textExport'),
                  (textDump, 'textDump'),
                  (speechExport, 'speechExport'),
                  (speakerExport, 'speakerExport'),
                  (speakerAnalytics, 'speakerAnalytics')):
              exportCSV(exportRows, exportName)






      python beginner





      share







      New contributor




      Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.










      share







      New contributor




      Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.








      share



      share






      New contributor




      Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.









      asked 4 mins ago









      SebastianSebastian

      6




      6




      New contributor




      Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.





      New contributor





      Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






      Sebastian is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
      Check out our Code of Conduct.






















          0






          active

          oldest

          votes











          Your Answer





          // Editor hook for MathJax: requests the "mathjaxEditing" module and registers a
          // Markdown-editor creation callback that calls prepareWmdForMathJax with "$"
          // as both opening and closing delimiter.
          // NOTE(review): presumably ifUsing only runs the callback on pages that load
          // the "editor" module -- confirm against the StackExchange loader.
          StackExchange.ifUsing("editor", function () {
          return StackExchange.using("mathjaxEditing", function () {
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          });
          });
          }, "mathjax-editing");

          // Editor hook for code snippets: requests "externalEditor", then "snippets",
          // and finally calls StackExchange.snippets.init().
          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          // On page ready: initialise the tag renderer, then create the answer editor.
          StackExchange.ready(function() {
              var channelOptions = {
                  tags: "".split(" "),
                  id: "196"
              };
              initTagRenderer("".split(" "), "".split(" "), channelOptions);

              StackExchange.using("externalEditor", function() {
                  // Have to fire editor after snippets, if snippets enabled
                  if (StackExchange.settings.snippets.snippetsEnabled) {
                      StackExchange.using("snippets", function() {
                          createEditor();
                      });
                  }
                  else {
                      createEditor();
                  }
              });

              // Passes the answer-editor configuration to StackExchange.prepareEditor.
              function createEditor() {
                  StackExchange.prepareEditor({
                      heartbeatType: 'answer',
                      autoActivateHeartbeat: false,
                      convertImagesToLinks: false,
                      noModals: true,
                      showLowRepImageUploadWarning: true,
                      reputationToPostImages: null,
                      bindNavPrevention: true,
                      postfix: "",
                      imageUploader: {
                          // The HTML is embedded with \u003c / \u003e for "<" / ">" and
                          // escaped inner quotes; without the backslashes these
                          // literals do not parse.
                          brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                          contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                          allowUrls: true
                      },
                      onDemand: true,
                      discardSelector: ".discard-answer",
                      immediatelyShowMarkdownHelp: true
                  });
              }
          });






          Sebastian is a new contributor. Be nice, and check out our Code of Conduct.










          draft saved

          draft discarded


















          // On page ready, call openid.initPostLogin for the '.new-post-login' selector.
          // The second argument is the URL-encoded address of this question's
          // "#new-answer" anchor; the third is the literal 'question_page'.
          // NOTE(review): presumably the URL is where the user returns after logging
          // in -- confirm against the StackExchange openid module.
          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f212599%2fpython-script-to-pull-text-data%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          0






          active

          oldest

          votes








          0






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes








          Sebastian is a new contributor. Be nice, and check out our Code of Conduct.










          draft saved

          draft discarded


















          Sebastian is a new contributor. Be nice, and check out our Code of Conduct.













          Sebastian is a new contributor. Be nice, and check out our Code of Conduct.












          Sebastian is a new contributor. Be nice, and check out our Code of Conduct.
















          Thanks for contributing an answer to Code Review Stack Exchange!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          Use MathJax to format equations. MathJax reference.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          // On page ready, call openid.initPostLogin for the '.new-post-login' selector.
          // The second argument is the URL-encoded address of this question's
          // "#new-answer" anchor; the third is the literal 'question_page'.
          // NOTE(review): this call also appears earlier on the page with the same
          // arguments -- possibly an artifact of how the page was captured.
          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f212599%2fpython-script-to-pull-text-data%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          404 Error Contact Form 7 ajax form submitting

          How to know if a Active Directory user can login interactively

          TypeError: fit_transform() missing 1 required positional argument: 'X'