import bs4
import requests

# Set a request header to mimic a browser visit and avoid the site's anti-crawler checks
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}
print("豆瓣评分top250的电影:")
for start_num in range(0, 250, 25):
    # Build the Douban Top250 URL; the start parameter handles pagination
    response = requests.get(f"https://movie.douban.com/top250?start={start_num}", headers=headers)
    # Get the response body
    text = response.text
    # Parse the HTML document with BeautifulSoup
    soup = bs4.BeautifulSoup(text, "lxml")
    # The movie titles sit in <span> tags with class="title"
    all_movies = soup.find_all("span", {"class": "title"})
    i = start_num + 1
    for movie in all_movies:
        # The alternate (foreign-language) title span starts with "&nbsp;/", so skip it
        if movie.string[1] == '/':
            continue
        s = str(i) + ":" + movie.string
        print(s)
        i += 1
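As a small add-on that is not part of the original script, it can help to check the HTTP status before parsing and to pause briefly between the ten page requests, so a blocked or throttled response is noticed early. A minimal sketch, assuming the same URL pattern and user-agent header as above:

import time

import bs4
import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}
for start_num in range(0, 250, 25):
    response = requests.get(f"https://movie.douban.com/top250?start={start_num}", headers=headers)
    # Stop early instead of parsing an error page if Douban rejects the request
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "lxml")
    print(f"page starting at {start_num}: {len(soup.find_all('span', {'class': 'title'}))} title spans")
    time.sleep(1)  # pause between pages to keep the request rate modest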

Adding the rating

import bs4
import requests

# Set a request header to mimic a browser visit and avoid the site's anti-crawler checks
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}
print("豆瓣评分top250的电影:")
for start_num in range(0, 250, 25):
    # Build the Douban Top250 URL; the start parameter handles pagination
    response = requests.get(f"https://movie.douban.com/top250?start={start_num}", headers=headers)
    # Get the response body
    text = response.text
    # Parse the HTML document with BeautifulSoup
    soup = bs4.BeautifulSoup(text, "lxml")

    # Each movie sits in a <div class="item"> block
    all_movies = soup.find_all("div", {"class": "item"})

    for idx, movie in enumerate(all_movies, start=start_num + 1):
        title_tag = movie.find("span", {"class": "title"})
        star_tag = movie.find("span", {"class": "rating_num"})
        title = title_tag.get_text(strip=True) if title_tag else "未知电影"
        star = star_tag.get_text(strip=True) if star_tag else "未知评分"
        # strip=True removes any leading or trailing whitespace
        print(f"{idx}:{title}\n评分:{star}")

Adding persistent storage

Exporting to an Excel spreadsheet

# @Time    : 2024/10/29 4:25 PM
# @Author  : 于嘉琪
# @FileName: 持久化保存excel
# @Blog    : www.shuaishuaiqi.top

import bs4
import requests
import pandas as pd

# Set a request header to mimic a browser visit and avoid the site's anti-crawler checks
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 "
                  "Safari/537.36"
}
# Collect the rows in a list and build the DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0
rows = []
print("豆瓣评分top250的电影:")

for start_num in range(0, 250, 25):
    # Build the Douban Top250 URL; the start parameter handles pagination
    response = requests.get(f"https://movie.douban.com/top250?start={start_num}", headers=headers)
    # Get the response body
    text = response.text
    # Parse the HTML document with BeautifulSoup
    soup = bs4.BeautifulSoup(text, "lxml")

    all_movies = soup.find_all("div", {"class": "item"})

    for idx, movie in enumerate(all_movies, start=start_num + 1):
        title_tag = movie.find("span", {"class": "title"})
        star_tag = movie.find("span", {"class": "rating_num"})
        appraise_tag = movie.find("span", {"class": "inq"})
        title = title_tag.get_text(strip=True) if title_tag else "未知电影"
        star = star_tag.get_text(strip=True) if star_tag else "未知评分"
        appraise = appraise_tag.get_text(strip=True) if appraise_tag else "未知评价"
        # strip=True removes any leading or trailing whitespace
        rows.append({
            '排名': idx,
            '电影标题': title,
            '评分': star,
            '评价': appraise
        })
        print(f"{idx}:{title}\n打分:{star}\n评价:{appraise}")

movies_df = pd.DataFrame(rows, columns=['排名', '电影标题', '评分', '评价'])
# to_excel no longer accepts an encoding argument; openpyxl writes UTF-8 by default
movies_df.to_excel('douban_top250_movies.xlsx', index=False, engine='openpyxl')

print("数据已保存到douban_top250_movies.xlsx文件中。")

Exporting to a CSV file

import bs4
import requests
import csv

# Set a request header to mimic a browser visit and avoid the site's anti-crawler checks
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 "
                  "Safari/537.36"
}

with open('douban_top250_movies.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    print("豆瓣评分top250的电影:")
    # Write the CSV header row (the last column holds the one-line quote, not a reviewer count)
    writer.writerow(['排名', '电影标题', '评分', '评价'])
    for start_num in range(0, 250, 25):
        # Build the Douban Top250 URL; the start parameter handles pagination
        response = requests.get(f"https://movie.douban.com/top250?start={start_num}", headers=headers)
        # Get the response body
        text = response.text
        # Parse the HTML document with BeautifulSoup
        soup = bs4.BeautifulSoup(text, "lxml")

        all_movies = soup.find_all("div", {"class": "item"})

        for idx, movie in enumerate(all_movies, start=start_num + 1):
            title_tag = movie.find("span", {"class": "title"})
            star_tag = movie.find("span", {"class": "rating_num"})
            appraise_tag = movie.find("span", {"class": "inq"})
            title = title_tag.get_text(strip=True) if title_tag else "未知电影"
            star = star_tag.get_text(strip=True) if star_tag else "未知评分"
            appraise = appraise_tag.get_text(strip=True) if appraise_tag else "未知评价"
            # strip=True removes any leading or trailing whitespace
            writer.writerow([idx, title, star, appraise])
            print(f"{idx}:{title}\n打分:{star}\n评价:{appraise}")
print("数据已保存到douban_top250_movies.csv文件中。")

Exporting to a MongoDB database

import bs4
import requests
from pymongo import MongoClient

# Set a request header to mimic a browser visit and avoid the site's anti-crawler checks
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 "
                  "Safari/537.36"
}

client = MongoClient('localhost', 27017)  # create the client instance
db = client['douban_movies']              # select the database
collection = db['top250']                 # select the collection

print("豆瓣评分top250的电影:")
for start_num in range(0, 250, 25):
    # Build the Douban Top250 URL; the start parameter handles pagination
    response = requests.get(f"https://movie.douban.com/top250?start={start_num}", headers=headers)
    # Get the response body
    text = response.text
    # Parse the HTML document with BeautifulSoup
    soup = bs4.BeautifulSoup(text, "lxml")

    all_movies = soup.find_all("div", {"class": "item"})

    for idx, movie in enumerate(all_movies, start=start_num + 1):
        title_tag = movie.find("span", {"class": "title"})
        star_tag = movie.find("span", {"class": "rating_num"})
        appraise_tag = movie.find("span", {"class": "inq"})
        title = title_tag.get_text(strip=True) if title_tag else "未知电影"
        star = star_tag.get_text(strip=True) if star_tag else "未知评分"
        appraise = appraise_tag.get_text(strip=True) if appraise_tag else "未知评价"
        # strip=True removes any leading or trailing whitespace
        print(f"{idx}:{title}\n打分:{star}\n评价:{appraise}")
        collection.insert_one({
            "排名": idx,
            "电影标题": title,
            "评分": star,
            "评价": appraise
        })
print("数据已保存到MongoDB中。")