# 基于参考资料《Python 网络爬虫从入门到实践》改编而成的用于爬取 typecho 框架下 handsome 主题的主标题信息的爬虫代码。

#用于爬取typecho框架下handsome主题的主标题信息
#Author:harumonia

import requests
from bs4 import BeautifulSoup

def get_movies(base_url='https://www.moerats.com/page/', pages=range(1, 10)):
    """Collect post titles from the paginated index of a typecho/handsome site.

    For every page number in ``pages`` the URL ``base_url + str(page) + '/'``
    is fetched, the HTTP status code is printed, and the text of the first
    ``<a>`` inside the first ``<h2>`` of each
    ``<div class="post-meta wrapper-lg">`` is collected.

    Args:
        base_url: URL prefix of the paginated index; the page number and a
            trailing slash are appended to it. Defaults to the site the
            script was originally written for.
        pages: Iterable of page numbers to fetch. Defaults to pages 1-9.

    Returns:
        A list of stripped title strings, in page order and then in document
        order within each page.

    Note:
        Exceptions raised by ``requests.get`` (for example on timeout) are
        not caught here and propagate to the caller.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    titles = []

    for page in pages:
        link = base_url + str(page) + '/'
        r = requests.get(link, headers=headers, timeout=10)
        print(str(page), "页响应状态码:", r.status_code)

        soup = BeautifulSoup(r.text, "lxml")
        for block in soup.find_all('div', class_='post-meta wrapper-lg'):
            # A matched div without an <h2><a> title would otherwise raise
            # AttributeError and abort the whole crawl; skip it instead.
            heading = block.h2
            anchor = heading.a if heading is not None else None
            if anchor is None:
                continue
            titles.append(anchor.text.strip())
    return titles

# Run the scraper and write every collected title on its own line.
movies = get_movies()
for title in movies:
    print(title)