A simple web scraper











up vote
1
down vote

favorite
1












I've written a simple web scraper in Python. As it is my first venture in Python, I'd like to ask for a code review from more experienced coders. All input is welcome.



A little bit of background:



I'm a huge fan of video games. In Japan, there is a company called Media Create which tracks all sales of games and consoles and releases them weekly (in Japanese language). Another site (www.gematsu.com) translates those charts and releases them weekly on their website. My script fetches the data and saves it to a text file.



The script should run as-is, but it is dependent on a text file of the previous week existing in script-location/data.



The name of the previous week's text file (in script-location/data) needs to be something like media-create-sales-11-26-18-12-2-18.txt (replacing the start and end dates with any dates other than the dates of the most recently released data).



I am not affiliated with Media Create or Gematsu and this script serves no commercial purpose.



import requests
from bs4 import BeautifulSoup
import datetime
import glob
import os

def getLatestFilename():
    """Return the path of the newest file under ``data/``.

    "Newest" is decided by ``os.path.getctime`` (filesystem creation/change
    time), not by the dates embedded in the filename.  Raises ``ValueError``
    if ``data/`` is empty or missing, because ``max()`` gets no candidates.
    """
    candidates = glob.glob('data/*')
    return max(candidates, key=os.path.getctime)

def parseUrl(dateString):
    """Compute the next week's date interval from last week's filename.

    The filename looks like
    ``data/media-create-sales-11-26-18-12-2-18.txt``: tokens 6..8 of the
    '-'-split name are the *end* date of the previous interval (m-d-yy).
    The new interval starts one day after that date and ends seven days
    after it.

    Returns the interval as ``"m-d-yy-m-d-yy"`` with no zero padding on
    month and day, matching gematsu.com's URL scheme.
    """
    tokens = dateString.split('-')

    # Tokens 6..8 hold the previous interval's end date; drop the ".txt".
    last_end_text = "-".join(tokens[6:9]).split('.')[0]
    last_end = datetime.datetime.strptime(last_end_text, "%m-%d-%y")

    start = last_end + datetime.timedelta(days=1)
    end = last_end + datetime.timedelta(days=7)

    def unpadded(d):
        # Bug fix: the old code used .lstrip("0").replace("0", ""), which
        # deleted *every* zero — "10-20-18" became "1-2-18".  Only the zero
        # *padding* of month and day must go; the two-digit year stays.
        return "{}-{}-{}".format(d.month, d.day, d.strftime("%y"))

    return unpadded(start) + '-' + unpadded(end)

def getUrl(filename):
    """Build the gematsu.com chart URL as a list of string fragments.

    The caller joins the fragments into the final URL.  The year and month
    come from the current date, so the script assumes it runs in the same
    month the chart was published.
    """
    now = datetime.datetime.now()
    return [
        "https://gematsu.com/",
        str(now.year),
        '/',
        str(now.month),
        '/',
        'media-create-sales-',
        parseUrl(filename),
    ]

def fetchContentForUrl(url):
    """Download the chart page at *url* and save the charts to a text file.

    Writes the numbered game-sales chart and the console-sales chart found
    inside the page's ``entry`` element to ``data/media<...>.txt``, where
    the filename mirrors the URL's trailing ``media-create-sales-...`` part
    so that the next run can derive the next interval from it.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    output_path = "data/media" + url.split('media')[1] + ".txt"

    # 'with' guarantees the file is flushed and closed; the original left
    # the handle open.  The bare 'n' literals were escape-stripped '\n's.
    with open(output_path, "w") as outputfile:
        outputfile.write(url + '\n\n')

        content = soup.find(class_='entry')

        # First ordered list after the entry element: the game sales chart.
        # find_next replaces the deprecated findNext alias.
        gameCharts = content.find_next('ol')
        for idx, entry in enumerate(gameCharts.find_all('li'), start=1):
            outputfile.write(str(idx) + "." + entry.get_text() + "\n")

        # The following sibling ordered list holds the console sales chart.
        consoleCharts = gameCharts.find_next_sibling("ol")
        for entry in consoleCharts.find_all('li'):
            outputfile.write(entry.get_text() + "\n")

def main():
    """Locate last week's data file, derive this week's URL, and scrape it."""
    latest = getLatestFilename()
    print(latest)
    url = ''.join(getUrl(latest))
    print(url)
    fetchContentForUrl(url)


if __name__ == '__main__':
    main()









share|improve this question


























    up vote
    1
    down vote

    favorite
    1












    I've written a simple web scraper in Python. As it is my first venture in Python, I'd like to ask for a code review from more experienced coders. All input is welcome.



    A little bit of background:



    I'm a huge fan of video games. In Japan, there is a company called Media Create which tracks all sales of games and consoles and releases them weekly (in Japanese language). Another site (www.gematsu.com) translates those charts and releases them weekly on their website. My script fetches the data and saves it to a text file.



    The script should run as-is, but it is dependent on a text file of the previous week existing in script-location/data.



    The name of the script needs to be something like media-create-sales-11-26-18-12-2-18.txt (replacing the start and end dates with any dates other than the dates of the most recently released data).



    I am not affiliated with Media Create or Gematsu and this script serves no commercial purpose.



    import requests
    from bs4 import BeautifulSoup
    import datetime
    import glob
    import os

    def getLatestFilename():

    list_of_files = glob.glob('data/*')
    latest_file = max(list_of_files, key=os.path.getctime)

    return latest_file

    def parseUrl(dateString):

    dateString = dateString.split('-')

    lastdateString = str("-".join(dateString[6:9])).split('.')[0]

    lastdate_object = datetime.datetime.strptime(lastdateString, "%m-%d-%y")

    current_startdatetime_object = lastdate_object + datetime.timedelta(days=1)
    current_enddatetime_object = lastdate_object + datetime.timedelta(days=7)

    cur_startDateString = current_startdatetime_object.strftime("%m-%d-%y").lstrip("0").replace("0", "")
    cur_endDateString = current_enddatetime_object.strftime("%m-%d-%y").lstrip("0").replace("0", "")

    return (cur_startDateString + '-' + cur_endDateString)

    def getUrl(filename):

    curIntervalString = parseUrl(filename)

    now = datetime.datetime.now()

    url = list()
    url.append("https://gematsu.com/")
    url.append(str(now.year))
    url.append('/')
    url.append(str(now.month))
    url.append('/')
    url.append('media-create-sales-')
    url.append(curIntervalString)

    return url

    def fetchContentForUrl(url):

    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    outputfile = open("data/media" + url.split('media')[1] + ".txt", "w")

    outputfile.write(url + 'n' + 'n')

    content = soup.find(class_='entry')

    gameCharts = content.findNext('ol')
    gameEntries = gameCharts.find_all('li')

    for idx,entry in enumerate(gameEntries,start=1):
    outputfile.write(str(idx) + "." + entry.get_text() + "n")

    print ('n')

    consoleCharts = gameCharts.find_next_sibling("ol")
    consoleEntries = consoleCharts.find_all('li')

    for entry in consoleEntries:
    outputfile.write(entry.get_text() + "n")

    def main():
    latestFilename = getLatestFilename()
    print (latestFilename)
    url = ''.join(getUrl(latestFilename))
    print (url)
    fetchContentForUrl(url)

    if __name__ == '__main__':
    main()









    share|improve this question
























      up vote
      1
      down vote

      favorite
      1









      up vote
      1
      down vote

      favorite
      1






      1





      I've written a simple web scraper in Python. As it is my first venture in Python, I'd like to ask for a code review from more experienced coders. All input is welcome.



      A little bit of background:



      I'm a huge fan of video games. In Japan, there is a company called Media Create which tracks all sales of games and consoles and releases them weekly (in Japanese language). Another site (www.gematsu.com) translates those charts and releases them weekly on their website. My script fetches the data and saves it to a text file.



      The script should run as-is, but it is dependent on a text file of the previous week existing in script-location/data.



      The name of the script needs to be something like media-create-sales-11-26-18-12-2-18.txt (replacing the start and end dates with any dates other than the dates of the most recently released data).



      I am not affiliated with Media Create or Gematsu and this script serves no commercial purpose.



      import requests
      from bs4 import BeautifulSoup
      import datetime
      import glob
      import os

      def getLatestFilename():

      list_of_files = glob.glob('data/*')
      latest_file = max(list_of_files, key=os.path.getctime)

      return latest_file

      def parseUrl(dateString):

      dateString = dateString.split('-')

      lastdateString = str("-".join(dateString[6:9])).split('.')[0]

      lastdate_object = datetime.datetime.strptime(lastdateString, "%m-%d-%y")

      current_startdatetime_object = lastdate_object + datetime.timedelta(days=1)
      current_enddatetime_object = lastdate_object + datetime.timedelta(days=7)

      cur_startDateString = current_startdatetime_object.strftime("%m-%d-%y").lstrip("0").replace("0", "")
      cur_endDateString = current_enddatetime_object.strftime("%m-%d-%y").lstrip("0").replace("0", "")

      return (cur_startDateString + '-' + cur_endDateString)

      def getUrl(filename):

      curIntervalString = parseUrl(filename)

      now = datetime.datetime.now()

      url = list()
      url.append("https://gematsu.com/")
      url.append(str(now.year))
      url.append('/')
      url.append(str(now.month))
      url.append('/')
      url.append('media-create-sales-')
      url.append(curIntervalString)

      return url

      def fetchContentForUrl(url):

      page = requests.get(url)
      soup = BeautifulSoup(page.text, 'html.parser')

      outputfile = open("data/media" + url.split('media')[1] + ".txt", "w")

      outputfile.write(url + 'n' + 'n')

      content = soup.find(class_='entry')

      gameCharts = content.findNext('ol')
      gameEntries = gameCharts.find_all('li')

      for idx,entry in enumerate(gameEntries,start=1):
      outputfile.write(str(idx) + "." + entry.get_text() + "n")

      print ('n')

      consoleCharts = gameCharts.find_next_sibling("ol")
      consoleEntries = consoleCharts.find_all('li')

      for entry in consoleEntries:
      outputfile.write(entry.get_text() + "n")

      def main():
      latestFilename = getLatestFilename()
      print (latestFilename)
      url = ''.join(getUrl(latestFilename))
      print (url)
      fetchContentForUrl(url)

      if __name__ == '__main__':
      main()









      share|improve this question













      I've written a simple web scraper in Python. As it is my first venture in Python, I'd like to ask for a code review from more experienced coders. All input is welcome.



      A little bit of background:



      I'm a huge fan of video games. In Japan, there is a company called Media Create which tracks all sales of games and consoles and releases them weekly (in Japanese language). Another site (www.gematsu.com) translates those charts and releases them weekly on their website. My script fetches the data and saves it to a text file.



      The script should run as-is, but it is dependent on a text file of the previous week existing in script-location/data.



      The name of the script needs to be something like media-create-sales-11-26-18-12-2-18.txt (replacing the start and end dates with any dates other than the dates of the most recently released data).



      I am not affiliated with Media Create or Gematsu and this script serves no commercial purpose.



      import requests
      from bs4 import BeautifulSoup
      import datetime
      import glob
      import os

      def getLatestFilename():

      list_of_files = glob.glob('data/*')
      latest_file = max(list_of_files, key=os.path.getctime)

      return latest_file

      def parseUrl(dateString):

      dateString = dateString.split('-')

      lastdateString = str("-".join(dateString[6:9])).split('.')[0]

      lastdate_object = datetime.datetime.strptime(lastdateString, "%m-%d-%y")

      current_startdatetime_object = lastdate_object + datetime.timedelta(days=1)
      current_enddatetime_object = lastdate_object + datetime.timedelta(days=7)

      cur_startDateString = current_startdatetime_object.strftime("%m-%d-%y").lstrip("0").replace("0", "")
      cur_endDateString = current_enddatetime_object.strftime("%m-%d-%y").lstrip("0").replace("0", "")

      return (cur_startDateString + '-' + cur_endDateString)

      def getUrl(filename):

      curIntervalString = parseUrl(filename)

      now = datetime.datetime.now()

      url = list()
      url.append("https://gematsu.com/")
      url.append(str(now.year))
      url.append('/')
      url.append(str(now.month))
      url.append('/')
      url.append('media-create-sales-')
      url.append(curIntervalString)

      return url

      def fetchContentForUrl(url):

      page = requests.get(url)
      soup = BeautifulSoup(page.text, 'html.parser')

      outputfile = open("data/media" + url.split('media')[1] + ".txt", "w")

      outputfile.write(url + 'n' + 'n')

      content = soup.find(class_='entry')

      gameCharts = content.findNext('ol')
      gameEntries = gameCharts.find_all('li')

      for idx,entry in enumerate(gameEntries,start=1):
      outputfile.write(str(idx) + "." + entry.get_text() + "n")

      print ('n')

      consoleCharts = gameCharts.find_next_sibling("ol")
      consoleEntries = consoleCharts.find_all('li')

      for entry in consoleEntries:
      outputfile.write(entry.get_text() + "n")

      def main():
      latestFilename = getLatestFilename()
      print (latestFilename)
      url = ''.join(getUrl(latestFilename))
      print (url)
      fetchContentForUrl(url)

      if __name__ == '__main__':
      main()






      python web-scraping






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked 1 hour ago









      Fang

      203110




      203110






















          1 Answer
          1






          active

          oldest

          votes

















          up vote
          1
          down vote













          This is a very good attempt for the first Python venture!





          Here is what I would consider improving:





          • stylistic / PEP8





            • naming. In Python, it is recommended to use lower_case_with_underscores notation. Your function and variable names are inconsistent and there is a mix of styles - e.g. cur_startDateString follows both camelCase and lower_case_with_underscores


            • the use of blank lines. In Python, it is recommended to have 2 blank lines between top-level blocks of the code and have an extra new-line at the end of the script


            • the use of whitespaces. Watch for whitespaces after commas in expressions, e.g.:



              for idx,entry in enumerate(gameEntries,start=1):
              ^ ^


            • import grouping and order. See Imports section of the PEP8


            • parentheses around the return value are redundant

            • consider adding docstrings explaining what your functions are about. For instance, getLatestFilename would benefit from a note about how does it define which file is the latest. And, parseUrl is not really easy to understand just by reading the code

            • if you are using Python 3.5+, consider adding Type Hints




          • Design and modularity




            • it is a good thing that you tried to logically separate scraping from parsing

            • I would also separate parsing and dumping the results into a CSV file




          • HTML Parsing




            • if performance is a concern, consider switching from html.parser to lxml - see "Installing a Parser" section of the documentation


            • you could also improve performance of the parsing stage by using a SoupStrainer which allows you to parse the desired part of the HTML tree in the first place:



              from bs4 import BeautifulSoup, SoupStrainer

              parse_only = SoupStrainer(class_='entry')
              soup = BeautifulSoup(page.text, 'lxml', parse_only=parse_only)


            • .findNext() is deprecated in favour of .find_next()





          • Other





            • getUrl() method feels a little bit overcomplicated and you can definitely improve its readability and simplify it at the same time. Plus, I would make it return the URL as a string and not as a list to avoid any extra external code needed to make it a string. If you are using Python 3.6+, f-strings would be handy:



              def get_url(filename: str) -> str:
              """Constructs a complete URL based on the filename and current date."""
              current_interval = parse_url(filename)

              now = datetime.datetime.now()

              return f"https://gematsu.com/{now.year}/{now.month}/media-create-sales-{current_interval}"


              Having the complete URL string with placeholders helps to see what the end result might look like.






          Note that most of the PEP8 violations mentioned above could be caught by running a linter (like flake8, or pylint) against your code.






          share|improve this answer























            Your Answer





            StackExchange.ifUsing("editor", function () {
            return StackExchange.using("mathjaxEditing", function () {
            StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
            });
            });
            }, "mathjax-editing");

            StackExchange.ifUsing("editor", function () {
            StackExchange.using("externalEditor", function () {
            StackExchange.using("snippets", function () {
            StackExchange.snippets.init();
            });
            });
            }, "code-snippets");

            StackExchange.ready(function() {
            var channelOptions = {
            tags: "".split(" "),
            id: "196"
            };
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function() {
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
            createEditor();
            });
            }
            else {
            createEditor();
            }
            });

            function createEditor() {
            StackExchange.prepareEditor({
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
            brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
            contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
            allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            });


            }
            });














            draft saved

            draft discarded


















            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f209575%2fa-simple-web-scraper%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown

























            1 Answer
            1






            active

            oldest

            votes








            1 Answer
            1






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes








            up vote
            1
            down vote













            This is a very good attempt for the first Python venture!





            Here is what I would consider improving:





            • stylistic / PEP8





              • naming. In Python, it is recommended to use lower_case_with_underscores notation. Your function and variable names are inconsistent and there is a mix of styles - e.g. cur_startDateString follows both camelCase and lower_case_with_underscores


              • the user of blank lines. In Python, it is recommended to have 2 blank lines between top-level blocks of the code and have an extra new-line at the end of the script


              • the use of whitespaces. Watch for whitespaces after commas in expressions, e.g.:



                for idx,entry in enumerate(gameEntries,start=1):
                ^ ^


              • import grouping and order. See Imports section of the PEP8


              • parenthesis around the return value are redundant

              • consider adding docstrings explaining what your functions are about. For instance, getLatestFilename would benefit from a note about how does it define which file is the latest. And, parseUrl is not really easy to understand just by reading the code

              • if you are using Python 3.5+, consider adding Type Hints




            • Design and modularity




              • it is a good thing that you tried to logically separate scraping from parsing

              • I would also separate parsing and dumping the results into a CSV file




            • HTML Parsing




              • if performance is a concern, consider switching from html.parser to lxml - see "Installing a Parser" section of the documentation


              • you could also improve performance of the parsing stage by using a SoupStrainer which allows you to parse the desired part of the HTML tree in the first place:



                from bs4 import BeautifulSoup, SoupStrainer

                parse_only = SoupStrainer(class_='entry')
                soup = BeautifulSoup(page.text, 'lxml', parse_only=parse_only)


              • .findNext() is deprecated in favour of .find_next()





            • Other





              • getUrl() method feels a little bit overcomplicated and you can definitely improve its readability and simplify it at the same time. Plus, I would make it return the URL as a string and not as a list to avoid any extra external code needed to make it a string. If you are using Python 3.6+, f-strings would be handy:



                def get_url(filename: str) -> str:
                """Constructs a complete URL based on the filename and current date."""
                current_interval = parse_url(filename)

                now = datetime.datetime.now()

                return f"https://gematsu.com/{now.year}/{now.month}/media-create-sales-{current_interval}"


                Having the complete URL string with placeholders helps to see what the end result might look like.






            Note that most of the PEP8 violations mentioned above could be caught by running a linter (like flake8, or pylint) against your code.






            share|improve this answer



























              up vote
              1
              down vote













              This is a very good attempt for the first Python venture!





              Here is what I would consider improving:





              • stylistic / PEP8





                • naming. In Python, it is recommended to use lower_case_with_underscores notation. Your function and variable names are inconsistent and there is a mix of styles - e.g. cur_startDateString follows both camelCase and lower_case_with_underscores


                • the user of blank lines. In Python, it is recommended to have 2 blank lines between top-level blocks of the code and have an extra new-line at the end of the script


                • the use of whitespaces. Watch for whitespaces after commas in expressions, e.g.:



                  for idx,entry in enumerate(gameEntries,start=1):
                  ^ ^


                • import grouping and order. See Imports section of the PEP8


                • parenthesis around the return value are redundant

                • consider adding docstrings explaining what your functions are about. For instance, getLatestFilename would benefit from a note about how does it define which file is the latest. And, parseUrl is not really easy to understand just by reading the code

                • if you are using Python 3.5+, consider adding Type Hints




              • Design and modularity




                • it is a good thing that you tried to logically separate scraping from parsing

                • I would also separate parsing and dumping the results into a CSV file




              • HTML Parsing




                • if performance is a concern, consider switching from html.parser to lxml - see "Installing a Parser" section of the documentation


                • you could also improve performance of the parsing stage by using a SoupStrainer which allows you to parse the desired part of the HTML tree in the first place:



                  from bs4 import BeautifulSoup, SoupStrainer

                  parse_only = SoupStrainer(class_='entry')
                  soup = BeautifulSoup(page.text, 'lxml', parse_only=parse_only)


                • .findNext() is deprecated in favour of .find_next()





              • Other





                • getUrl() method feels a little bit overcomplicated and you can definitely improve its readability and simplify it at the same time. Plus, I would make it return the URL as a string and not as a list to avoid any extra external code needed to make it a string. If you are using Python 3.6+, f-strings would be handy:



                  def get_url(filename: str) -> str:
                  """Constructs a complete URL based on the filename and current date."""
                  current_interval = parse_url(filename)

                  now = datetime.datetime.now()

                  return f"https://gematsu.com/{now.year}/{now.month}/media-create-sales-{current_interval}"


                  Having the complete URL string with placeholders helps to see what the end result might look like.






              Note that most of the PEP8 violations mentioned above could be caught by running a linter (like flake8, or pylint) against your code.






              share|improve this answer

























                up vote
                1
                down vote










                up vote
                1
                down vote









                This is a very good attempt for the first Python venture!





                Here is what I would consider improving:





                • stylistic / PEP8





                  • naming. In Python, it is recommended to use lower_case_with_underscores notation. Your function and variable names are inconsistent and there is a mix of styles - e.g. cur_startDateString follows both camelCase and lower_case_with_underscores


                  • the user of blank lines. In Python, it is recommended to have 2 blank lines between top-level blocks of the code and have an extra new-line at the end of the script


                  • the use of whitespaces. Watch for whitespaces after commas in expressions, e.g.:



                    for idx,entry in enumerate(gameEntries,start=1):
                    ^ ^


                  • import grouping and order. See Imports section of the PEP8


                  • parenthesis around the return value are redundant

                  • consider adding docstrings explaining what your functions are about. For instance, getLatestFilename would benefit from a note about how does it define which file is the latest. And, parseUrl is not really easy to understand just by reading the code

                  • if you are using Python 3.5+, consider adding Type Hints




                • Design and modularity




                  • it is a good thing that you tried to logically separate scraping from parsing

                  • I would also separate parsing and dumping the results into a CSV file




                • HTML Parsing




                  • if performance is a concern, consider switching from html.parser to lxml - see "Installing a Parser" section of the documentation


                  • you could also improve performance of the parsing stage by using a SoupStrainer which allows you to parse the desired part of the HTML tree in the first place:



                    from bs4 import BeautifulSoup, SoupStrainer

                    parse_only = SoupStrainer(class_='entry')
                    soup = BeautifulSoup(page.text, 'lxml', parse_only=parse_only)


                  • .findNext() is deprecated in favour of .find_next()





                • Other





                  • getUrl() method feels a little bit overcomplicated and you can definitely improve its readability and simplify it at the same time. Plus, I would make it return the URL as a string and not as a list to avoid any extra external code needed to make it a string. If you are using Python 3.6+, f-strings would be handy:



                    def get_url(filename: str) -> str:
                    """Constructs a complete URL based on the filename and current date."""
                    current_interval = parse_url(filename)

                    now = datetime.datetime.now()

                    return f"https://gematsu.com/{now.year}/{now.month}/media-create-sales-{current_interval}"


                    Having the complete URL string with placeholders helps to see what the end result might look like.






                Note that most of the PEP8 violations mentioned above could be caught by running a linter (like flake8, or pylint) against your code.






                share|improve this answer














                This is a very good attempt for the first Python venture!





                Here is what I would consider improving:





                • stylistic / PEP8





                  • naming. In Python, it is recommended to use lower_case_with_underscores notation. Your function and variable names are inconsistent and there is a mix of styles - e.g. cur_startDateString follows both camelCase and lower_case_with_underscores


                  • the user of blank lines. In Python, it is recommended to have 2 blank lines between top-level blocks of the code and have an extra new-line at the end of the script


                  • the use of whitespaces. Watch for whitespaces after commas in expressions, e.g.:



                    for idx,entry in enumerate(gameEntries,start=1):
                    ^ ^


                  • import grouping and order. See Imports section of the PEP8


                  • parenthesis around the return value are redundant

                  • consider adding docstrings explaining what your functions are about. For instance, getLatestFilename would benefit from a note about how does it define which file is the latest. And, parseUrl is not really easy to understand just by reading the code

                  • if you are using Python 3.5+, consider adding Type Hints




                • Design and modularity




                  • it is a good thing that you tried to logically separate scraping from parsing

                  • I would also separate parsing and dumping the results into a CSV file




                • HTML Parsing




                  • if performance is a concern, consider switching from html.parser to lxml - see "Installing a Parser" section of the documentation


                  • you could also improve performance of the parsing stage by using a SoupStrainer which allows you to parse the desired part of the HTML tree in the first place:



                    from bs4 import BeautifulSoup, SoupStrainer

                    parse_only = SoupStrainer(class_='entry')
                    soup = BeautifulSoup(page.text, 'lxml', parse_only=parse_only)


                  • .findNext() is deprecated in favour of .find_next()





                • Other





                  • getUrl() method feels a little bit overcomplicated and you can definitely improve its readability and simplify it at the same time. Plus, I would make it return the URL as a string and not as a list to avoid any extra external code needed to make it a string. If you are using Python 3.6+, f-strings would be handy:



                    def get_url(filename: str) -> str:
                    """Constructs a complete URL based on the filename and current date."""
                    current_interval = parse_url(filename)

                    now = datetime.datetime.now()

                    return f"https://gematsu.com/{now.year}/{now.month}/media-create-sales-{current_interval}"


                    Having the complete URL string with placeholders helps to see what the end result might look like.






                Note that most of the PEP8 violations mentioned above could be caught by running a linter (like flake8, or pylint) against your code.







                share|improve this answer














                share|improve this answer



                share|improve this answer








                edited 18 mins ago

























                answered 23 mins ago









                alecxe

                14.5k53277




                14.5k53277






























                    draft saved

                    draft discarded




















































                    Thanks for contributing an answer to Code Review Stack Exchange!


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid



                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.


                    Use MathJax to format equations. MathJax reference.


                    To learn more, see our tips on writing great answers.





                    Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


                    Please pay close attention to the following guidance:


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid



                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.


                    To learn more, see our tips on writing great answers.




                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function () {
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f209575%2fa-simple-web-scraper%23new-answer', 'question_page');
                    }
                    );

                    Post as a guest















                    Required, but never shown





















































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown

































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown







                    Popular posts from this blog

                    404 Error Contact Form 7 ajax form submitting

                    How to know if a Active Directory user can login interactively

                    TypeError: fit_transform() missing 1 required positional argument: 'X'