Seeking a better way of iterating through data in a web crawler

I've been experimenting more with web crawling and have started to understand it better than when I asked my previous questions. Right now, my code scrapes each page of a car forum and iterates through every page. What would you recommend improving?

from requests import get

from bs4 import BeautifulSoup, SoupStrainer

import pandas as pd





def get_response(url, timeout=10):
    """Fetch *url* and parse only the review-listing part of the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout ``requests`` waits indefinitely,
            so one stalled page would hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` document restricted, via ``SoupStrainer``, to
        ``<ul>`` elements whose class is ``posts posts-archive``.

    Note:
        The HTTP status code is not checked; the body is parsed whatever
        the server returned.
    """
    response = get(url, timeout=timeout)
    # Parsing only the listing <ul> skips the rest of the page markup.
    listing_only = SoupStrainer('ul', {'class': 'posts posts-archive'})
    return BeautifulSoup(response.text, 'lxml', parse_only=listing_only)





def iteration(url, max_page=52):
    """Download every listing page from 1 to *max_page* and print its reviews.

    Args:
        url: Base address of the topic, e.g.
            ``https://paultan.org/topics/test-drive-reviews/``. A trailing
            slash is optional.
        max_page: Number of the last page to fetch (inclusive).

    Returns:
        None. Each page's table is printed as soon as it has been built.
    """
    # Normalise once so that a base URL without a trailing slash does not
    # produce ".../test-drive-reviewspage/1".
    base = url.rstrip('/')
    for page in range(1, max_page + 1):
        # e.g. https://paultan.org/topics/test-drive-reviews/page/1
        soup = get_response(f"{base}/page/{page}")
        print(create_pdReview(get_reviews_title(soup)))





def get_reviews_title(response):
    """Collect the headline and date of every review in a parsed page.

    Args:
        response: Parsed page; calling it with ``'article'`` must yield the
            review containers.

    Returns:
        A list of ``(title, date)`` tuples, one per ``<article>``, where the
        title is the text of the link inside the article's ``<h2>`` and the
        date is the text of its ``<time>`` element.
    """
    reviews = []
    for article in response('article'):
        title = article.h2.a.text
        published = article.time.text
        reviews.append((title, published))
    return reviews





def create_pdReview(data):
    """Turn scraped ``(title, date)`` rows into a two-column table.

    Args:
        data: Rows to load, each holding a title followed by a date.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title`` and ``Date``.
    """
    column_names = ['Title', 'Date']
    table = pd.DataFrame(data, columns=column_names)
    return table





if __name__ == '__main__':
    URL = 'https://paultan.org/topics/test-drive-reviews/'
    # iteration() prints each page itself and returns None, so wrapping the
    # call in print() only added a stray "None" line at the end of the output.
    iteration(URL)

enter code here

I've been wondering: would using yield improve the efficiency and simplicity of the code? How would it be done? I ask because I've been trying to learn from the answers to my previous questions.
Similar Question

asked 6 mins ago

Minial

425

New contributor

add a comment |

from requests import get

from bs4 import BeautifulSoup, SoupStrainer

import pandas as pd





def get_response(url, timeout=10):
    """Fetch *url* and parse only the review-listing part of the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout ``requests`` waits indefinitely,
            so one stalled page would hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` document restricted, via ``SoupStrainer``, to
        ``<ul>`` elements whose class is ``posts posts-archive``.

    Note:
        The HTTP status code is not checked; the body is parsed whatever
        the server returned.
    """
    response = get(url, timeout=timeout)
    # Parsing only the listing <ul> skips the rest of the page markup.
    listing_only = SoupStrainer('ul', {'class': 'posts posts-archive'})
    return BeautifulSoup(response.text, 'lxml', parse_only=listing_only)





def iteration(url, max_page=52):
    """Download every listing page from 1 to *max_page* and print its reviews.

    Args:
        url: Base address of the topic, e.g.
            ``https://paultan.org/topics/test-drive-reviews/``. A trailing
            slash is optional.
        max_page: Number of the last page to fetch (inclusive).

    Returns:
        None. Each page's table is printed as soon as it has been built.
    """
    # Normalise once so that a base URL without a trailing slash does not
    # produce ".../test-drive-reviewspage/1".
    base = url.rstrip('/')
    for page in range(1, max_page + 1):
        # e.g. https://paultan.org/topics/test-drive-reviews/page/1
        soup = get_response(f"{base}/page/{page}")
        print(create_pdReview(get_reviews_title(soup)))





def get_reviews_title(response):
    """Collect the headline and date of every review in a parsed page.

    Args:
        response: Parsed page; calling it with ``'article'`` must yield the
            review containers.

    Returns:
        A list of ``(title, date)`` tuples, one per ``<article>``, where the
        title is the text of the link inside the article's ``<h2>`` and the
        date is the text of its ``<time>`` element.
    """
    reviews = []
    for article in response('article'):
        title = article.h2.a.text
        published = article.time.text
        reviews.append((title, published))
    return reviews





def create_pdReview(data):
    """Turn scraped ``(title, date)`` rows into a two-column table.

    Args:
        data: Rows to load, each holding a title followed by a date.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title`` and ``Date``.
    """
    column_names = ['Title', 'Date']
    table = pd.DataFrame(data, columns=column_names)
    return table





if __name__ == '__main__':
    URL = 'https://paultan.org/topics/test-drive-reviews/'
    # iteration() prints each page itself and returns None, so wrapping the
    # call in print() only added a stray "None" line at the end of the output.
    iteration(URL)

enter code here

asked 6 mins ago

Minial

425

New contributor

add a comment |

from requests import get

from bs4 import BeautifulSoup, SoupStrainer

import pandas as pd





def get_response(url, timeout=10):
    """Fetch *url* and parse only the review-listing part of the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout ``requests`` waits indefinitely,
            so one stalled page would hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` document restricted, via ``SoupStrainer``, to
        ``<ul>`` elements whose class is ``posts posts-archive``.

    Note:
        The HTTP status code is not checked; the body is parsed whatever
        the server returned.
    """
    response = get(url, timeout=timeout)
    # Parsing only the listing <ul> skips the rest of the page markup.
    listing_only = SoupStrainer('ul', {'class': 'posts posts-archive'})
    return BeautifulSoup(response.text, 'lxml', parse_only=listing_only)





def iteration(url, max_page=52):
    """Download every listing page from 1 to *max_page* and print its reviews.

    Args:
        url: Base address of the topic, e.g.
            ``https://paultan.org/topics/test-drive-reviews/``. A trailing
            slash is optional.
        max_page: Number of the last page to fetch (inclusive).

    Returns:
        None. Each page's table is printed as soon as it has been built.
    """
    # Normalise once so that a base URL without a trailing slash does not
    # produce ".../test-drive-reviewspage/1".
    base = url.rstrip('/')
    for page in range(1, max_page + 1):
        # e.g. https://paultan.org/topics/test-drive-reviews/page/1
        soup = get_response(f"{base}/page/{page}")
        print(create_pdReview(get_reviews_title(soup)))





def get_reviews_title(response):
    """Collect the headline and date of every review in a parsed page.

    Args:
        response: Parsed page; calling it with ``'article'`` must yield the
            review containers.

    Returns:
        A list of ``(title, date)`` tuples, one per ``<article>``, where the
        title is the text of the link inside the article's ``<h2>`` and the
        date is the text of its ``<time>`` element.
    """
    reviews = []
    for article in response('article'):
        title = article.h2.a.text
        published = article.time.text
        reviews.append((title, published))
    return reviews





def create_pdReview(data):
    """Turn scraped ``(title, date)`` rows into a two-column table.

    Args:
        data: Rows to load, each holding a title followed by a date.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title`` and ``Date``.
    """
    column_names = ['Title', 'Date']
    table = pd.DataFrame(data, columns=column_names)
    return table





if __name__ == '__main__':
    URL = 'https://paultan.org/topics/test-drive-reviews/'
    # iteration() prints each page itself and returns None, so wrapping the
    # call in print() only added a stray "None" line at the end of the output.
    iteration(URL)

enter code here

asked 6 mins ago

Minial

425

New contributor

from requests import get

from bs4 import BeautifulSoup, SoupStrainer

import pandas as pd





def get_response(url, timeout=10):
    """Fetch *url* and parse only the review-listing part of the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout ``requests`` waits indefinitely,
            so one stalled page would hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` document restricted, via ``SoupStrainer``, to
        ``<ul>`` elements whose class is ``posts posts-archive``.

    Note:
        The HTTP status code is not checked; the body is parsed whatever
        the server returned.
    """
    response = get(url, timeout=timeout)
    # Parsing only the listing <ul> skips the rest of the page markup.
    listing_only = SoupStrainer('ul', {'class': 'posts posts-archive'})
    return BeautifulSoup(response.text, 'lxml', parse_only=listing_only)





def iteration(url, max_page=52):
    """Download every listing page from 1 to *max_page* and print its reviews.

    Args:
        url: Base address of the topic, e.g.
            ``https://paultan.org/topics/test-drive-reviews/``. A trailing
            slash is optional.
        max_page: Number of the last page to fetch (inclusive).

    Returns:
        None. Each page's table is printed as soon as it has been built.
    """
    # Normalise once so that a base URL without a trailing slash does not
    # produce ".../test-drive-reviewspage/1".
    base = url.rstrip('/')
    for page in range(1, max_page + 1):
        # e.g. https://paultan.org/topics/test-drive-reviews/page/1
        soup = get_response(f"{base}/page/{page}")
        print(create_pdReview(get_reviews_title(soup)))





def get_reviews_title(response):
    """Collect the headline and date of every review in a parsed page.

    Args:
        response: Parsed page; calling it with ``'article'`` must yield the
            review containers.

    Returns:
        A list of ``(title, date)`` tuples, one per ``<article>``, where the
        title is the text of the link inside the article's ``<h2>`` and the
        date is the text of its ``<time>`` element.
    """
    reviews = []
    for article in response('article'):
        title = article.h2.a.text
        published = article.time.text
        reviews.append((title, published))
    return reviews





def create_pdReview(data):
    """Turn scraped ``(title, date)`` rows into a two-column table.

    Args:
        data: Rows to load, each holding a title followed by a date.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title`` and ``Date``.
    """
    column_names = ['Title', 'Date']
    table = pd.DataFrame(data, columns=column_names)
    return table





if __name__ == '__main__':
    URL = 'https://paultan.org/topics/test-drive-reviews/'
    # iteration() prints each page itself and returns None, so wrapping the
    # call in print() only added a stray "None" line at the end of the output.
    iteration(URL)

enter code here

python python-3.x beautifulsoup

asked 6 mins ago

Minial

425

New contributor

asked 6 mins ago

Minial

425

New contributor

asked 6 mins ago

Minial

425

New contributor

asked 6 mins ago

Minial

425

asked 6 mins ago

Minial

425

New contributor

Minial is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.

add a comment |

0

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

Minial is a new contributor. Be nice, and check out our Code of Conduct.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211458%2fseeking-better-way-of-iterating-through-data-in-a-webcrawler%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

0

active

oldest

votes

0

active

oldest

votes

Minial is a new contributor. Be nice, and check out our Code of Conduct.

draft saved

draft discarded

Minial is a new contributor. Be nice, and check out our Code of Conduct.

Thanks for contributing an answer to Code Review Stack Exchange!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

Use MathJax to format equations. MathJax reference.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is for reference only. If you need detailed information, please check here.

搜尋此網誌

Tukukkk