爬取中国大学排名并作可视化分析

本文已参与「掘力星计划」,赢取创作大礼包,挑战创作激励金

导读

肥友们,最近有位粉丝找到我直言出价让我代做作业,我一听当场就急眼了。我肥学是这样的人吗?


直接就问他给多少钱,嘻嘻嘻!!!当然了,给多少钱我也不会干的。既然是粉丝,我肯定尽量帮啊,于是就有了今天这篇博客。

爬取中国大学排名

链接:中国大学排名
其实还是挺简单的,这位粉丝肯定没有好好看我以前的文章,这种爬取说过很多次了。所以我们直接整起来。


我们直接找到要获得的这些信息的id或者class

1
2
3
4
5
6
7
8
# Download the ranking page and append its table cells to a CSV file.
# NOTE(review): `url` and `header` are not defined in this snippet; they are
# assumed to be set earlier in the script -- confirm.
res = requests.get(url=url, headers=header).content.decode('utf-8')
soup = BeautifulSoup(res, 'lxml')
names = soup.findAll(name="a", attrs={"class": "name-cn"})
xinxi = soup.findAll("td")

# The cells are consumed six at a time, i.e. one CSV row per six <td> elements.
CELLS_PER_ROW = 6
with open("中国大学.csv", 'a', encoding="utf-8", newline="") as f:
    # Build the writer once instead of once per row.
    w = csv.writer(f)
    # Stop at the last complete group so a trailing partial row cannot raise
    # IndexError.
    complete = len(xinxi) - len(xinxi) % CELLS_PER_ROW
    for i in range(0, complete, CELLS_PER_ROW):
        w.writerow([cell.text.strip() for cell in xinxi[i:i + CELLS_PER_ROW]])

然后顺利拿到信息


可视化分析
=====

词云

上面我们已经拿到了信息,我们先对这些大学集中的省市和类别做一个词云分析


可以看出来综合和北京的居多

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
python复制代码import jieba
from wordcloud import WordCloud
from matplotlib import colors
import csv
import pandas as pd

# Read only the two columns used for the word cloud (positions 2 and 3; the
# article describes them as province/city and category). header=None keeps the
# first line as data.
# NOTE(review): this assumes the CSV was written without a header row, as the
# scraping step does -- confirm.
info = pd.read_csv("中国大学.csv", usecols=[2, 3], header=None)

# Build the corpus from the cell values themselves. str(DataFrame) is only a
# display repr: it contains the row index and is truncated for long tables.
text = " ".join(info.fillna("").astype(str).to_numpy().ravel())

# Join the jieba tokens with spaces: joining with "" would simply rebuild the
# unsegmented string and leave the word cloud nothing to split on.
cut_text = " ".join(jieba.cut(text))

color_list = ['#FF0000', '#9955FF', '#66FFFF']  # colors used for the words
colormap = colors.ListedColormap(color_list)
# Optional shape mask (disabled):
# color_mask = cv2.imread("11.jpg")
word_cloud = WordCloud(
    font_path="msyh.ttc",
    background_color='black',
    mode="RGBA",
    prefer_horizontal=1,
    # mask=color_mask,
    height=200,
    width=200,
    scale=1,
    colormap=colormap,  # apply the custom colors
    margin=5
)

word_cloud1 = word_cloud.generate(cut_text)
word_cloud1.to_file('2.png')

print("图片保存成功")

折线统计图


这里用 ECharts 来做,有兴趣的大佬也可以用 cufflinks,同样可以达到这个效果。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
javascript复制代码//这里只把js可变部分贴了出来
option = {
title: {
text: '中国大学数据'
},
tooltip: {
trigger: 'axis'
},
legend: {
data: ['总分', '办学层次']
},
grid: {
left: '3%',
right: '4%',
bottom: '3%',
containLabel: true
},
toolbox: {
feature: {
saveAsImage: {}
}
},
xAxis: {
type: 'category',
boundaryGap: false,
data: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
},
yAxis: {
type: 'value'
},
series: [

{
name: '总分',
type: 'line',
stack: 'Total',
data: [969.2,855.3,768.7,723.4,654.8,649.7,577.0,574.3,567.9,537.9,522.6,519.3,518.3,516.6,513.8,508.3,488.1,487.8,474.0,465.3,447.0,444.3,442.2,435.7,430.5,427.8,419.8,418.2,401.8,400.4]
},
{
name: '办学层次',
type: 'line',
stack: 'Total',
data: [37.9,36.1,34.3,35.5,35.1,36.6,40.0,32.1,31.8,34.5,32.7,30.9,34.8,30.7,32.8,33.2,34.3,34.5,32.3,31.5,28.8,32.7,30.8,30.4,32.4,32.7,30.5,30.2,35.2,31.8]
}
]
};

获取全国211以上大学的动态地理坐标

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# Global matplotlib settings for the animation below: SimHei as the sans-serif
# font, unicode minus disabled, and the HTML writer as the default animation
# writer with an embed limit of 100.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
    'animation.writer': 'html',
    'animation.embed_limit': 100,
})


def rgba_to_rgb(img_rgba):
    """Flatten an RGBA PIL image onto a white background.

    The fourth band of ``img_rgba`` is used as the paste mask, so transparent
    areas show the white canvas. Returns a new "RGB" image of the same size.
    """
    alpha_band = img_rgba.split()[3]
    canvas = Image.new("RGB", img_rgba.size, (255, 255, 255))
    canvas.paste(img_rgba, mask=alpha_band)
    return canvas


def html_to_gif(html_file, gif_file, duration=0.5):
    """Assemble the frame images stored next to an HTML animation into a GIF.

    The frames are read from the directory ``<html_file without extension>_frames``
    in sorted file-name order.
    NOTE(review): presumably this is the directory matplotlib's HTML writer
    creates when frames are not embedded -- confirm.

    Args:
        html_file: Path of the saved HTML animation.
        gif_file: Path of the GIF to write.
        duration: Per-frame duration passed through to ``imageio.mimsave``.

    Returns:
        ``gif_file``, unchanged.

    Raises:
        ValueError: If the frames directory contains no files.
    """
    # Strip only the extension; str.replace would rewrite every ".html" that
    # happens to occur anywhere in the path.
    root, _ext = os.path.splitext(html_file)
    path = root + "_frames"
    images = [os.path.join(path, x) for x in sorted(os.listdir(path))]
    if not images:
        raise ValueError("no frame images found in {!r}".format(path))
    frames = [imageio.imread(x) for x in images]
    # Four channels means RGBA: flatten onto white before writing the GIF.
    if frames[0].shape[-1] == 4:
        frames = [np.array(rgba_to_rgb(Image.fromarray(x))) for x in frames]
    imageio.mimsave(gif_file, frames, 'gif', duration=duration)
    return gif_file

# Twelve-color palette repeated 100 times, so it can be indexed or sliced by
# point position without running out of entries.
cmap = ['#2E91E5','#1CA71C','#DA16FF','#B68100','#EB663B','#00A08B','#FC0080','#6C7C32','#862A16','#620042','#DA60CA','#0D2A63'] * 100

def getCoords(geom):
    """Return the coordinate arrays of a shapely geometry.

    Args:
        geom: A ``Polygon``, ``MultiPolygon``, ``LineString`` or
            ``MultiLineString``.

    Returns:
        A list of numpy arrays, one per polygon exterior ring or per line.
        Interior rings (holes) of polygons are not returned.

    Raises:
        TypeError: If ``geom`` is none of the supported geometry types.
    """
    # Read coordinates through ``.coords``: converting a geometry object
    # directly with np.array() depends on the array interface that Shapely 2.0
    # removed, while ``.coords`` works on both 1.x and 2.x.
    if isinstance(geom, geo.MultiPolygon):
        return [np.array(g.exterior.coords) for g in geom.geoms]
    elif isinstance(geom, geo.Polygon):
        return [np.array(geom.exterior.coords)]
    elif isinstance(geom, geo.LineString):
        return [np.array(geom.coords)]
    elif isinstance(geom, geo.MultiLineString):
        return [np.array(x.coords) for x in geom.geoms]
    else:
        raise TypeError("geom must be one of [polygon,MultiPolygon,LineString,MultiLineString]!")


def _read_projected(path):
    """Read a GeoJSON file, tag it as EPSG:4326 and reproject to EPSG:2343."""
    return gpd.read_file(path).set_crs("epsg:4326").to_crs("epsg:2343")


# Base-map layers: province polygons and the South China Sea layer.
dfprovince = _read_projected("dfprovince.geojson")
dfnanhai = _read_projected("dfnanhai.geojson")
# Rows with LENGTH strictly between 1.0 and 2.0 (drawn later as the nine-dash line).
dfline9 = dfnanhai[(dfnanhai["LENGTH"] > 1.0) & (dfnanhai["LENGTH"] < 2.0)]

# Point layers: the 985 and 211 universities, stacked into one table.
df985 = _read_projected("中国985大学.geojson")
df211 = _read_projected("中国211大学.geojson")
dfpoints = pd.concat([df985, df211], axis=0)

# Plain x/y table indexed by university name; every point gets the same
# weight z = 1.0.
df = pd.DataFrame(
    {
        "x": [pt.x for pt in dfpoints["geometry"]],
        "y": [pt.y for pt in dfpoints["geometry"]],
    }
)
df["z"] = 1.0
df.index = dfpoints["name"].values


def bubble_map_dance(df, title="中国116所211高校位置分布",
                     filename=None,
                     figsize=(8, 6), dpi=144,
                     duration=0.5,
                     anotate_points=("北京邮电大学", "南昌大学", "华中农业大学", "东华大学", "云南大学",
                                     "陕西师范大学", "内蒙古大学", "西藏大学", "新疆大学", "青海大学", "哈尔滨工程大学")):
    """Animate points appearing one by one on the province base map.

    Each row of ``df`` gets three consecutive frames in which its marker and
    label shrink from large to normal size. Rows shown earlier stay on the map
    as small, faded markers. A small inset axes repeats the base map and the
    faded markers for the lower-right part of the map extent.

    Args:
        df: DataFrame with numeric columns ``x``, ``y`` and ``z``, indexed by
            the label to display.
            NOTE(review): coordinates are assumed to be in the same CRS as the
            module-level ``dfprovince`` -- confirm.
        title: Text drawn at the top of the map.
        filename: If given, the animation is saved there and the name is
            returned. Otherwise an IPython ``HTML`` object is returned.
        figsize: Figure size passed to ``plt.subplots``.
        dpi: Figure resolution passed to ``plt.subplots``.
        duration: Seconds per frame.
        anotate_points: Labels that keep a permanent annotation once shown.

    Returns:
        ``filename`` when it is given; otherwise an ``IPython.display.HTML``
        object, or ``None`` when IPython is not installed.
    """
    fig, ax_base = plt.subplots(figsize=figsize, dpi=dpi)
    ax_child = fig.add_axes([0.800, 0.125, 0.10, 0.20])

    # Everything below is identical for every frame, so compute it once
    # instead of once per frame.
    province_rings = [ring for geom in dfprovince["geometry"] for ring in getCoords(geom)]
    nine_dash_lines = [ln for geom in dfline9["geometry"] for ln in getCoords(geom)]
    bounds = dfprovince.total_bounds
    width = bounds[2] - bounds[0]
    height = bounds[3] - bounds[1]
    z_mean = df["z"].mean()
    palette = cmap * 100
    labelled = set(anotate_points)

    # Frame phase (i % 3) -> (marker size factor, label font size).
    highlight_steps = {0: (800, 20), 1: (400, 15), 2: (100, 10)}

    def plot_frame(i):
        ax_base.clear()
        ax_child.clear()

        # Province boundaries (a patch can belong to one axes only, hence
        # one patch per axes).
        for ring in province_rings:
            for ax in (ax_base, ax_child):
                ax.add_patch(plt.Polygon(ring, fill=True, ec="gray", fc="white",
                                         alpha=0.5, linewidth=.8))

        # Nine-dash line.
        for ln in nine_dash_lines:
            x, y = np.transpose(ln)
            for ax in (ax_base, ax_child):
                ax.add_artist(plt.Line2D(x, y, color="gray", linestyle="-.", linewidth=1.5))

        # Spine styling: hidden on the main map, faded on the inset.
        for spine in ['top', 'left', "bottom", "right"]:
            ax_base.spines[spine].set_color("none")
            ax_child.spines[spine].set_alpha(0.5)
        ax_base.axis("off")

        # Plot extents.
        ax_base.set_xlim(bounds[0] - width / 10, bounds[2] + width / 10)
        ax_base.set_ylim(bounds[1] + height / 3.5, bounds[3] + height / 100)

        ax_child.set_xlim(bounds[2] - width / 2.5, bounds[2] - width / 20)
        ax_child.set_ylim(bounds[1] - height / 20, bounds[1] + height / 2)

        # Remove the inset's axis ticks.
        ax_child.set_xticks([])
        ax_child.set_yticks([])

        k = i // 3 + 1   # number of rows revealed so far
        m = i % 3        # animation phase of the newest row
        text = "NO." + str(k)

        dfdata = df.iloc[:k, :]
        dftmp = df.iloc[:k - 1, :]

        # Rows revealed before the current one: small, faded markers.
        if len(dftmp) > 0:
            sizes = 100 * dftmp["z"] / z_mean
            point_colors = palette[0:len(dftmp)]
            for ax in (ax_base, ax_child):
                ax.scatter(dftmp["x"], dftmp["y"], s=sizes, c=point_colors, alpha=0.3, zorder=3)

            # Permanent labels. Iterating positionally avoids `.loc` lookups,
            # which return several rows when an index label is duplicated.
            for j, (name, px, py) in enumerate(zip(dftmp.index, dftmp["x"], dftmp["y"])):
                if name in labelled:
                    ax_base.annotate(name, xy=(px, py), xycoords="data", xytext=(-15, 10),
                                     fontsize=10, fontweight="bold", color=cmap[j],
                                     textcoords="offset points")

        # Title and the large rank watermark.
        # ax_base.set_title(title,color = "black",fontsize = 12)
        ax_base.text(0.5, 0.95, title, va="center", ha="center",
                     size=12, transform=ax_base.transAxes)
        ax_base.text(0.5, 0.5, text, va="center", ha="center",
                     alpha=0.3, size=50, transform=ax_base.transAxes)

        # Attention animation for the newest row. The row is taken by position
        # (.iloc) because the index holds string labels; integer keys on such
        # a Series rely on deprecated positional fallback.
        size_factor, font_size = highlight_steps[m]
        newest = dfdata.iloc[-1]
        px, py, pz = newest["x"], newest["y"], newest["z"]
        p = dfdata.index[-1]
        ax_base.scatter(px, py, s=size_factor * pz / z_mean,
                        c=cmap[len(dfdata) - 1:len(dfdata)], alpha=0.5, zorder=4)
        ax_base.annotate(p, xy=(px, py), xycoords="data",
                         xytext=(-15, 10), fontsize=font_size, fontweight="bold",
                         color=cmap[k - 1], textcoords="offset points", zorder=5)

    my_animation = animation.FuncAnimation(fig, plot_frame, frames=range(0, 3 * len(df)),
                                           interval=int(duration * 1000))

    if filename is not None:
        my_animation.save(filename)
        return filename

    try:
        from IPython.display import HTML
    except ImportError:
        # Without IPython there is nothing to display inline.
        return None
    # Render once; to_jshtml() draws every frame and is expensive.
    return HTML(my_animation.to_jshtml())

最后对肥友说

为了粉丝,这把拼了。我觉得真收费的大概也就这样了吧。好几天没有写 Python 了,这把直接给我整爽了。持续关注我,后面 Java、Python、Web 都给大家整一套。最后还是那句话:一起肥学,一起加油!

本文转载自: 掘金

开发者博客 – 和开发相关的 这里全都有

0%